diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index dfbfaa8c894f5..d448030b4beb3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1049,6 +1049,9 @@ void SelectionDAG::RemoveDeadNode(SDNode *N){ } void SelectionDAG::DeleteNode(SDNode *N) { + for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) + DUL->NodeDeleted(N, nullptr); + // First take this out of the appropriate CSE map. RemoveNodeFromCSEMaps(N); diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index e2d55990473c0..e55c7bd3ac5e6 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1969,19 +1969,19 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v15, s3 ; VI-NEXT: v_mov_b32_e32 v14, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x50 +; VI-NEXT: s_add_u32 s2, s0, 0x70 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v17, s3 ; VI-NEXT: v_mov_b32_e32 v16, s2 -; VI-NEXT: s_add_u32 s2, s0, 64 +; VI-NEXT: s_add_u32 s2, s0, 0x60 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: v_mov_b32_e32 v11, s1 ; VI-NEXT: v_mov_b32_e32 v18, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x70 +; VI-NEXT: s_add_u32 s2, s0, 0x50 ; VI-NEXT: v_mov_b32_e32 v10, s0 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: s_add_u32 s0, s0, 0x60 +; VI-NEXT: s_add_u32 s0, s0, 64 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v22, v4 @@ -1995,15 +1995,15 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v26, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; VI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v29, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v30, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v31, v3 +; VI-NEXT: v_cvt_f32_f16_sdwa v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v20 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v21 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v22 @@ -2040,40 +2040,40 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; GFX11-NEXT: global_load_b128 v[0:3], v32, s[2:3] ; GFX11-NEXT: global_load_b128 v[4:7], v32, s[2:3] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v5 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v15, v7 +; GFX11-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v23, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_cvt_f32_f16_e32 v14, v6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v22, v6 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v13, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v12, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v18, v4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v22, v5 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v23 -; GFX11-NEXT: v_cvt_f32_f16_e32 v34, v11 -; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v18, v5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v34, v11 +; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v19 +; GFX11-NEXT: v_cvt_f32_f16_e32 v14, v4 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 +; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v15 ; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v22 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v10 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v18 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v11 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v23 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v7 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v22 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v6 ; GFX11-NEXT: v_cvt_f32_f16_e32 v33, v9 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v15 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v7 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v18 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v11 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v14 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v6 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v10 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v8 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v13 @@ -2082,10 +2082,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v34 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v33 ; GFX11-NEXT: s_clause 0x7 -; GFX11-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:80 -; GFX11-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:64 -; GFX11-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112 -; GFX11-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96 +; GFX11-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:112 +; GFX11-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:96 +; GFX11-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:80 +; GFX11-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64 ; GFX11-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48 ; GFX11-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32 ; GFX11-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 88b18232ef9c8..d4b3cc3a656ba 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -5719,33 +5719,33 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt 
vmcnt(0) -; GFX6-NEXT: v_bfe_u32 v2, v29, 11, 1 -; GFX6-NEXT: v_bfe_u32 v0, v29, 10, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GFX6-NEXT: v_bfe_u32 v5, v29, 9, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 15, v29 +; GFX6-NEXT: v_bfe_u32 v0, v29, 14, 1 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GFX6-NEXT: v_bfe_u32 v5, v29, 13, 1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_bfe_u32 v3, v29, 8, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:64 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 15, v29 +; GFX6-NEXT: v_bfe_u32 v3, v29, 12, 1 +; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96 +; GFX6-NEXT: v_bfe_u32 v8, v29, 11, 1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_bfe_u32 v6, v29, 14, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GFX6-NEXT: v_bfe_u32 v27, v29, 5, 1 -; GFX6-NEXT: v_bfe_u32 v23, v29, 7, 1 -; GFX6-NEXT: v_bfe_u32 v19, v29, 1, 1 -; GFX6-NEXT: v_bfe_u32 v15, v29, 3, 1 -; GFX6-NEXT: v_bfe_u32 v11, v29, 13, 1 -; GFX6-NEXT: v_bfe_u32 v25, v29, 4, 1 -; GFX6-NEXT: v_bfe_u32 v21, v29, 6, 1 -; GFX6-NEXT: v_and_b32_e32 v17, 1, v29 -; GFX6-NEXT: v_bfe_u32 v13, v29, 2, 1 +; GFX6-NEXT: v_bfe_u32 v6, v29, 10, 1 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 +; GFX6-NEXT: v_bfe_u32 v27, v29, 1, 1 +; GFX6-NEXT: v_bfe_u32 v23, v29, 3, 1 +; GFX6-NEXT: v_bfe_u32 v19, v29, 5, 1 +; GFX6-NEXT: v_bfe_u32 v15, v29, 7, 1 +; GFX6-NEXT: v_bfe_u32 v11, v29, 9, 1 +; GFX6-NEXT: v_and_b32_e32 v25, 1, v29 +; GFX6-NEXT: v_bfe_u32 v21, v29, 2, 1 +; GFX6-NEXT: v_bfe_u32 v17, v29, 4, 1 +; GFX6-NEXT: v_bfe_u32 v13, v29, 6, 1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_bfe_u32 v9, v29, 12, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96 -; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 -; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:32 +; GFX6-NEXT: v_bfe_u32 v9, v29, 8, 1 +; GFX6-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: @@ -5761,7 +5761,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: s_add_u32 s4, s0, 0x50 +; GFX8-NEXT: s_add_u32 s4, s0, 0x60 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v23, s5 ; GFX8-NEXT: v_mov_b32_e32 v22, s4 @@ -5775,9 +5775,9 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v21, v2 ; GFX8-NEXT: v_mov_b32_e32 v25, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 10, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 12, v0 ; GFX8-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 11, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 13, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v1 ; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[5:8] @@ -5786,31 +5786,31 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr 
addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v22, s2 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: s_add_u32 s2, s0, 64 +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4] ; GFX8-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 11, v0 ; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4] ; GFX8-NEXT: v_mov_b32_e32 v23, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: s_add_u32 s2, s0, 0x50 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 10, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_and_b32_sdwa v8, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 9, v0 -; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v6, 1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 12, v0 ; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[8:11] ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 13, v0 +; GFX8-NEXT: v_and_b32_sdwa v11, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 9, v0 ; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 @@ -5940,35 +5940,33 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v28, 1, v0 -; GFX12-NEXT: v_lshrrev_b16 v4, 11, v0 -; GFX12-NEXT: v_lshrrev_b16 v8, 9, v0 -; GFX12-NEXT: v_lshrrev_b16 v12, 13, v0 +; GFX12-NEXT: v_lshrrev_b16 v4, 13, v0 +; GFX12-NEXT: v_lshrrev_b16 v8, 11, v0 +; GFX12-NEXT: v_lshrrev_b16 v12, 9, v0 ; GFX12-NEXT: v_lshrrev_b16 v16, 7, v0 ; GFX12-NEXT: v_lshrrev_b16 v2, 15, v0 ; GFX12-NEXT: v_lshrrev_b16 v6, 14, v0 -; GFX12-NEXT: v_lshrrev_b16 v10, 10, v0 ; GFX12-NEXT: v_lshrrev_b16 v20, 5, v0 ; GFX12-NEXT: v_lshrrev_b16 v24, 3, v0 ; GFX12-NEXT: v_lshrrev_b16 v32, 1, v0 +; GFX12-NEXT: v_lshrrev_b16 v10, 12, v0 ; GFX12-NEXT: v_and_b32_e32 v33, 1, v4 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v34, 1, v8 -; GFX12-NEXT: v_lshrrev_b16 v14, 8, v0 -; GFX12-NEXT: v_lshrrev_b16 v18, 12, v0 +; GFX12-NEXT: v_lshrrev_b16 v14, 10, v0 +; GFX12-NEXT: v_lshrrev_b16 v18, 8, v0 ; GFX12-NEXT: v_and_b32_e32 v35, 1, v12 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v36, 1, v16 ; GFX12-NEXT: v_lshrrev_b16 v22, 6, v0 ; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v38, 1, v24 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v32, 1, v32 -; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v4, 1, v10 ; GFX12-NEXT: v_mov_b32_e32 v23, v1 ; GFX12-NEXT: v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v2, 0xffff, v2 -; GFX12-NEXT: v_mov_b32_e32 v31, v1 ; GFX12-NEXT: v_lshrrev_b16 v26, 4, v0 ; GFX12-NEXT: v_lshrrev_b16 v30, 2, v0 ; GFX12-NEXT: v_and_b32_e32 v37, 1, v20 -; GFX12-NEXT: v_and_b32_e32 v0, 1, v6 +; GFX12-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v0, 1, v6 +; GFX12-NEXT: v_and_b32_e32 v4, 1, v10 ; 
GFX12-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v6, 0xffff, v33 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v24, 1, v30 ; GFX12-NEXT: v_and_b32_e32 v8, 1, v14 ; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v10, 0xffff, v34 @@ -5978,13 +5976,13 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v35 ; GFX12-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_and_b32 v18, 0xffff, v36 ; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v30, 0xffff, v32 -; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v38 +; GFX12-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_and_b32 v26, 0xffff, v38 ; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v37 ; GFX12-NEXT: s_clause 0x7 -; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:64 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96 +; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:96 +; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:64 ; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48 ; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32 ; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index a87fa8bf36d9e..4ddde98f98825 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -6652,27 +6652,27 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s1, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s4, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s2, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 @@ -6682,13 
+6682,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 @@ -6709,33 +6709,33 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s15, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s16, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s17, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s4, 16 ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-HSA-NEXT: s_and_b32 s3, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s11, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6748,22 +6748,22 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; 
GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6795,41 +6795,41 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s8, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s9, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s9, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s11, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 64 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6949,30 +6949,30 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_and_b32 s10, s5, 0xffff +; GFX12-NEXT: s_and_b32 s10, s7, 0xffff ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10 -; GFX12-NEXT: s_lshr_b32 s5, s5, 16 +; GFX12-NEXT: s_lshr_b32 s7, s7, 16 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 
v2, s5 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: s_lshr_b32 s7, s6, 16 +; GFX12-NEXT: s_and_b32 s6, s6, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s7 +; GFX12-NEXT: s_lshr_b32 s6, s5, 16 +; GFX12-NEXT: s_and_b32 s5, s5, 0xffff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96 +; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: s_lshr_b32 s5, s4, 16 ; GFX12-NEXT: s_and_b32 s4, s4, 0xffff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_mov_b32_e32 v2, s5 -; GFX12-NEXT: s_lshr_b32 s4, s7, 16 -; GFX12-NEXT: s_and_b32 s5, s7, 0xffff -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64 -; GFX12-NEXT: v_mov_b32_e32 v0, s5 -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_lshr_b32 s4, s6, 16 -; GFX12-NEXT: s_and_b32 s5, s6, 0xffff -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112 -; GFX12-NEXT: v_mov_b32_e32 v0, s5 -; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s4, s3, 16 ; GFX12-NEXT: s_and_b32 s3, s3, 0xffff -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 @@ -7195,57 +7195,58 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s1, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[12:13], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[18:19], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v3, s5 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s8, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s8, 0x60 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x60 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 64 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 @@ -7385,17 +7386,17 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s30, s5 -; GFX12-NEXT: s_lshr_b32 s34, s5, 16 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000 -; GFX12-NEXT: s_lshr_b32 s4, s4, 16 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 -; GFX12-NEXT: s_mov_b32 s24, s7 -; GFX12-NEXT: s_lshr_b32 s26, s7, 16 +; GFX12-NEXT: s_mov_b32 s30, s7 +; GFX12-NEXT: s_lshr_b32 s34, s7, 16 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[6:7], 0x100000 +; GFX12-NEXT: s_lshr_b32 s6, s6, 16 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 +; GFX12-NEXT: s_mov_b32 s24, s5 +; GFX12-NEXT: s_lshr_b32 s26, s5, 16 ; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 -; GFX12-NEXT: s_lshr_b32 s6, s6, 16 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GFX12-NEXT: s_lshr_b32 s4, s4, 16 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s29 ; GFX12-NEXT: s_mov_b32 s18, s3 ; GFX12-NEXT: s_lshr_b32 s20, s3, 16 @@ -7403,30 +7404,30 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31 ; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35 -; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000 
; GFX12-NEXT: s_lshr_b32 s2, s2, 16 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v13, s25 ; GFX12-NEXT: s_mov_b32 s12, s1 ; GFX12-NEXT: s_lshr_b32 s14, s1, 16 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7 +; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s5 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 ; GFX12-NEXT: s_lshr_b32 s0, s0, 16 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19 +; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v17, s19 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21 ; GFX12-NEXT: v_mov_b32_e32 v18, s20 ; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:96 ; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v0, s16 ; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 ; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12 @@ -7434,8 +7435,8 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10 ; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112 -; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:64 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:48 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:32 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 889755c23bbc7..2cd0172f6bbe3 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -6533,53 +6533,53 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s5, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s6, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s12, s6, 0x80008 -; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s7, 0x80008 -; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s4, 0x80008 -; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s5, 0x80008 -; GFX6-NOHSA-NEXT: s_and_b32 s16, s5, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s17, s4, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s18, s7, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s19, s6, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 
s10, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s12, s7, 0x80008 +; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s6, 0x80008 +; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s5, 0x80008 +; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s4, 0x80008 +; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s17, s5, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s18, s6, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s19, s7, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s19 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s17 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64: @@ -6590,70 +6590,70 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s8, s5, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s2, s6, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s11, s6, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s12, s7, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s13, s4, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s14, s5, 
0x80008 -; GFX7-HSA-NEXT: s_and_b32 s15, s5, 0xff -; GFX7-HSA-NEXT: s_and_b32 s16, s4, 0xff -; GFX7-HSA-NEXT: s_and_b32 s17, s7, 0xff -; GFX7-HSA-NEXT: s_and_b32 s18, s6, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s9, s5, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s2, s7, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s11, s7, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s12, s6, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s13, s5, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s14, s4, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s15, s4, 0xff +; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff +; GFX7-HSA-NEXT: s_and_b32 s17, s6, 0xff +; GFX7-HSA-NEXT: s_and_b32 s18, s7, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s3, s6, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s3, s7, 0x80010 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 
v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 @@ -6678,9 +6678,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s4 ; GFX8-NOHSA-NEXT: s_and_b32 s12, s5, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s5 -; GFX8-NOHSA-NEXT: s_and_b32 s13, s7, 0xff -; GFX8-NOHSA-NEXT: s_and_b32 s14, s6, 0xff -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s6 +; GFX8-NOHSA-NEXT: s_and_b32 s13, s6, 0xff +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s6 +; GFX8-NOHSA-NEXT: s_and_b32 s14, s7, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010 ; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010 @@ -6710,20 +6710,20 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 @@ -6844,8 +6844,8 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_lshr_b32 s2, s5, 24 ; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_bfe_u32 s3, s5, 0x80010 -; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6 -; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7 +; GFX12-NEXT: v_lshrrev_b16 v4, 8, s7 +; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6 ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -6860,20 +6860,20 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 ; GFX12-NEXT: v_mov_b32_e32 v0, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: s_and_b32 s2, s6, 0xff +; GFX12-NEXT: s_and_b32 s2, s7, 0xff ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5 -; GFX12-NEXT: s_and_b32 s2, s7, 0xff -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: s_and_b32 s2, s6, 0xff +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 ; GFX12-NEXT: s_and_b32 s2, s5, 0xff -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96 +; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll 
b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 21e27bfa75531..da8fd23e92b07 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -6490,23 +6490,23 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v21 @@ -6515,12 +6515,12 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v21 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -6547,68 +6547,68 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v8 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v20, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, v8 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v7 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[14:17] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[17:20] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[10:13] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] +; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[20:23] ; GCN-HSA-NEXT: 
flat_store_dwordx4 v[3:4], v[14:17] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[6:9] +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[24:27] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: @@ -6623,25 +6623,21 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v29 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v29 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v5 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v1 @@ -6649,20 +6645,24 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v29 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v5 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 @@ -7012,50 +7012,50 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v7, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v20, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/X86/pr40730.ll b/llvm/test/CodeGen/X86/pr40730.ll index 1d87a5773296a..dd4cb77a89757 100644 --- a/llvm/test/CodeGen/X86/pr40730.ll +++ b/llvm/test/CodeGen/X86/pr40730.ll @@ -17,10 +17,14 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) { } ; CHECK: .LCPI1_0: -; CHECK-NEXT: .quad 0x0000000e0000000d -; CHECK-NEXT: .quad 0x0000000e0000000d -; CHECK-NEXT: .quad 0x0000001000000000 -; CHECK-NEXT: .quad 0x0000000e0000000d +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .long 13 +; CHECK-NEXT: .long 14 +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .long 16 define <8 x i32> @shuffle_v8i32_0dcd3f14_constant(<8 x i32> %a0) { ; CHECK-LABEL: shuffle_v8i32_0dcd3f14_constant: @@ -29,7 +33,9 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14_constant(<8 x i32> %a0) { ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1,1,0] ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5],ymm0[6,7] +; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5],ymm0[6,7] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %a0, <8 x i32> , <8 x i32> ret <8 x i32> %res diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 8e55cb48cf7a2..528646dfd8e2c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -476,19 +476,19 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 ; AVX512-NEXT: vmovd %xmm2, %eax ; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512-NEXT: vpshufb 
{{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3] +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vmovq %xmm1, (%rdx) -; AVX512-NEXT: vmovq %xmm5, (%rcx) +; AVX512-NEXT: vmovq %xmm4, (%rcx) ; AVX512-NEXT: vmovq %xmm6, (%r8) ; AVX512-NEXT: vmovq %xmm0, (%r9) ; AVX512-NEXT: retq @@ -507,19 +507,19 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vmovd %xmm2, %eax ; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3] +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) ; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: retq @@ -542,19 +542,19 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 ; AVX512DQ-NEXT: vmovd %xmm2, %eax ; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-NEXT: retq @@ -573,19 +573,19 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vmovd %xmm2, %eax ; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 32825f291e98b..3400b22f2b1c3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -4652,12 +4652,12 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), 
%xmm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm1 @@ -4690,14 +4690,14 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm28 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm12 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm29 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm30 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm29 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm10 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm9 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; AVX512-FCP-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] @@ -4723,8 +4723,8 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 ; AVX512-FCP-NEXT: vpermd %zmm7, %zmm31, %zmm7 ; AVX512-FCP-NEXT: vpermd %zmm13, %zmm0, %zmm7 {%k1} ; AVX512-FCP-NEXT: vpermd %zmm8, %zmm31, %zmm8 @@ -4751,36 +4751,35 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm4, %zmm12 {%k1} ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm20, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0] -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 {%k2} # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0] +; AVX512-FCP-NEXT: vpermd %zmm20, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 {%k2} # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0] ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm13, %zmm13 -; AVX512-FCP-NEXT: vpermd %zmm29, %zmm4, %zmm13 {%k2} -; AVX512-FCP-NEXT: vpermd %zmm15, %zmm4, %zmm15 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm4, %zmm15 {%k2} -; AVX512-FCP-NEXT: vpermd %zmm10, %zmm4, %zmm10 -; AVX512-FCP-NEXT: 
vpermd %zmm9, %zmm4, %zmm10 {%k2} +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm2, %zmm13 {%k2} +; AVX512-FCP-NEXT: vpermd %zmm15, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vpermd %zmm14, %zmm2, %zmm15 {%k2} +; AVX512-FCP-NEXT: vpermd %zmm9, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vpermd %zmm10, %zmm2, %zmm9 {%k2} ; AVX512-FCP-NEXT: movb $-86, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 448(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512-FCP-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -4994,12 +4993,12 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm1 @@ -5032,14 +5031,14 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm28 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm29 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm30 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm29 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm10 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm9 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; AVX512DQ-FCP-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] @@ -5065,8 +5064,8 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 ; AVX512DQ-FCP-NEXT: vpermd %zmm7, %zmm31, %zmm7 ; AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm0, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm31, %zmm8 @@ -5093,36 +5092,35 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm4, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 {%k2} # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 {%k2} # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm13, %zmm13 -; AVX512DQ-FCP-NEXT: vpermd %zmm29, %zmm4, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm4, %zmm15 -; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm4, %zmm15 {%k2} -; AVX512DQ-FCP-NEXT: vpermd %zmm10, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vpermd %zmm9, %zmm4, %zmm10 {%k2} +; AVX512DQ-FCP-NEXT: vpermd %zmm29, %zmm2, %zmm13 {%k2} +; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm2, %zmm15 +; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm2, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vpermd %zmm9, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vpermd %zmm10, %zmm2, %zmm9 {%k2} ; AVX512DQ-FCP-NEXT: movb $-86, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 320(%rax) ; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm13, 256(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512DQ-FCP-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -8918,20 +8916,21 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-FCP-LABEL: store_i16_stride8_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512-FCP-NEXT: subq $2376, %rsp # imm = 0x948 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm12 +; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2 +; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%r10), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9097,46 +9096,47 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm5 +; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm1 ; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm8 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm5 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm10 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm19 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm11 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = 
ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm20 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm16 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; AVX512-FCP-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm8 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm10 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15] +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512-FCP-NEXT: vpunpcklwd (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX512-FCP-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512-FCP-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512-FCP-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm12 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm14 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm12 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movw $-30584, %ax # imm = 0x8888 @@ -9152,9 +9152,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermd %zmm25, %zmm1, %zmm24 {%k2} ; AVX512-FCP-NEXT: vpermd %zmm18, %zmm0, %zmm25 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm1, %zmm25 {%k2} -; AVX512-FCP-NEXT: vpermd %zmm15, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpermd %zmm17, %zmm1, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermd %zmm15, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpermd %zmm17, %zmm1, %zmm10 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermd %zmm7, %zmm0, %zmm30 ; AVX512-FCP-NEXT: vpermd %zmm13, %zmm1, %zmm30 {%k2} ; AVX512-FCP-NEXT: vpermd %zmm19, %zmm0, %zmm7 @@ -9180,14 +9180,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm9 {%k1} ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpermd %zmm20, %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpermd %zmm10, %zmm1, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpermd %zmm8, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpermd %zmm11, %zmm1, %zmm8 {%k1} ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,10,10,10,10,0,0,11,11] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11] -; AVX512-FCP-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 {%k2} # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11] +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 {%k2} # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 {%k2} # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload @@ -9198,65 +9198,64 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermd %zmm27, %zmm0, %zmm26 {%k2} ; AVX512-FCP-NEXT: vpermd %zmm21, %zmm0, %zmm21 ; AVX512-FCP-NEXT: vpermd %zmm23, %zmm0, %zmm21 {%k2} -; AVX512-FCP-NEXT: vpermd %zmm12, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm0, %zmm23 {%k2} -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512-FCP-NEXT: vpermd %zmm12, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermd %zmm14, %zmm0, %zmm12 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm12 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0] +; AVX512-FCP-NEXT: vpermd %zmm5, %zmm14, %zmm5 +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm27 -; AVX512-FCP-NEXT: vpermd %zmm27, %zmm11, %zmm27 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm27 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm28 -; AVX512-FCP-NEXT: vpermd %zmm28, %zmm11, %zmm28 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm28 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm29 -; AVX512-FCP-NEXT: vpermd %zmm29, %zmm11, %zmm29 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm29 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm31 -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm11, %zmm31 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm31 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512-FCP-NEXT: vpermd %zmm8, %zmm11, %zmm8 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23 +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm27 +; AVX512-FCP-NEXT: vpermd %zmm27, %zmm14, %zmm27 +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm28 +; AVX512-FCP-NEXT: vpermd %zmm28, %zmm14, %zmm28 +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm29 +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm14, %zmm29 +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm29 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm31 +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm14, %zmm31 +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vpermd %zmm10, %zmm14, %zmm10 +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm23 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm18 -; AVX512-FCP-NEXT: vpermd %zmm18, %zmm11, %zmm18 -; AVX512-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm18 {%k1} +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm14, %zmm18 +; AVX512-FCP-NEXT: vpermd 
%zmm23, %zmm14, %zmm18 {%k1} ; AVX512-FCP-NEXT: movb $-86, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} @@ -9266,16 +9265,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm9 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm31 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 256(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 448(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) @@ -9287,8 +9286,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 960(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 896(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%rax) -; AVX512-FCP-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512-FCP-NEXT: addq $2376, %rsp # imm = 0x948 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -9568,20 +9567,21 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512DQ-FCP-LABEL: store_i16_stride8_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512DQ-FCP-NEXT: subq $2376, %rsp # imm = 0x948 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r10), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512DQ-FCP-NEXT: 
vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9747,46 +9747,47 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm16 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm8 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklwd (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm14 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512DQ-FCP-NEXT: 
vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movw $-30584, %ax # imm = 0x8888 @@ -9802,9 +9803,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermd %zmm25, %zmm1, %zmm24 {%k2} ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm0, %zmm25 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm1, %zmm25 {%k2} -; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm1, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm1, %zmm10 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %zmm7, %zmm0, %zmm30 ; AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm1, %zmm30 {%k2} ; AVX512DQ-FCP-NEXT: vpermd %zmm19, %zmm0, %zmm7 @@ -9830,14 +9831,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpermd %zmm10, %zmm1, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm1, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,10,10,10,10,0,0,11,11] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 {%k2} # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,1,0,10,10,10,10,0,0,11,11] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 {%k2} # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 {%k2} # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload @@ -9848,65 +9849,64 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm0, %zmm26 {%k2} ; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm0, %zmm21 ; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm0, %zmm21 {%k2} -; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm0, %zmm23 {%k2} -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,1,1,1,1,0,0,10,0,11,0,11,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm0, 
%zmm12 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,1,1,1,1,0,0,10,10,11,11,11,11,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm14, %zmm5
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm5 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14
-; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm27
-; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm11, %zmm27
-; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm27 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm28
-; AVX512DQ-FCP-NEXT: vpermd %zmm28, %zmm11, %zmm28
-; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm28 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm29
-; AVX512DQ-FCP-NEXT: vpermd %zmm29, %zmm11, %zmm29
-; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm29 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm31
-; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm11, %zmm31
-; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm31 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8
-; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm11, %zmm8
-; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm8 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,0,1,0,1,0,0,0,10,10,11,11,11,11,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm27
+; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm14, %zmm27
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm27 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm28
+; AVX512DQ-FCP-NEXT: vpermd %zmm28, %zmm14, %zmm28
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm28 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm29
+; AVX512DQ-FCP-NEXT: vpermd %zmm29, %zmm14, %zmm29
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm29 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm31
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm14, %zmm31
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm31 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT: vpermd %zmm10, %zmm14, %zmm10
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm23
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm18
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm11, %zmm18
-; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm18 {%k1}
+; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm14, %zmm18
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm14, %zmm18 {%k1}
; AVX512DQ-FCP-NEXT: movb $-86, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1}
@@ -9916,16 +9916,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm9 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm31 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm8 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1}
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 320(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 256(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 448(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax)
@@ -9937,8 +9937,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 960(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 896(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%rax)
-; AVX512DQ-FCP-NEXT: addq $2312, %rsp # imm = 0x908
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax)
+; AVX512DQ-FCP-NEXT: addq $2376, %rsp # imm = 0x948
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 9a6d8c3366d98..3b609fb344362 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -4353,8 +4353,8 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
;
; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
; AVX2-SLOW-NEXT: retq
;
@@ -4367,8 +4367,8 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
; AVX2-FAST-PERLANE-NEXT: retq
;
@@ -4388,8 +4388,8 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
;
; XOPAVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
-; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -5099,8 +5099,8 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
;
; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
; AVX2-SLOW-NEXT: retq
;
@@ -5113,8 +5113,8 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
; AVX2-FAST-PERLANE-NEXT: retq
;
@@ -5134,8 +5134,8 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
;
; XOPAVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
-; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -6153,9 +6153,9 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1
;
; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2-NEXT: retq
;
@@ -6177,9 +6177,9 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1
;
; XOPAVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -6335,9 +6335,9 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1
;
; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2-NEXT: retq
;
@@ -6360,9 +6360,9 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1
;
; XOPAVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
+; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; XOPAVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -6519,7 +6519,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2
; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2-NEXT: retq
@@ -6543,7 +6543,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2
; XOPAVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; XOPAVX2-NEXT: retq
@@ -6597,7 +6597,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2-NEXT: retq
@@ -6622,7 +6622,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
; XOPAVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; XOPAVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 99e8cdb179c8d..d54d4f35fcf92 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -4760,14 +4760,14 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,29,0,31,0,1,0,1]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -4777,9 +4777,9 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,29,0,31,0,1,0,1]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
-; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm0, %zmm1
; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero