From 786fa9ffe54b93b41d3f10a5487936c275645422 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 4 Jun 2025 14:26:05 +0100 Subject: [PATCH] [X86] splitVector - use collectConcatOps to find pre-split subvectors Don't just match ISD::CONCAT_VECTORS - this matches more closely with isFreeToSplitVector --- llvm/lib/Target/X86/X86ISelLowering.cpp | 12 +- .../vector-interleaved-load-i16-stride-2.ll | 84 ++-- .../vector-interleaved-store-i16-stride-2.ll | 432 +++++++++--------- .../vector-interleaved-store-i32-stride-3.ll | 4 +- .../vector-interleaved-store-i32-stride-4.ll | 4 +- .../vector-interleaved-store-i32-stride-5.ll | 4 +- .../vector-interleaved-store-i32-stride-7.ll | 10 +- .../vector-interleaved-store-i32-stride-8.ll | 8 +- .../vector-interleaved-store-i8-stride-2.ll | 144 +++--- .../vector-interleaved-store-i8-stride-3.ll | 189 ++++---- .../zero_extend_vector_inreg_of_broadcast.ll | 184 ++++---- 11 files changed, 551 insertions(+), 524 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 24df848f87b9b..edf68964db833 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4349,13 +4349,13 @@ static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG, assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 && "Can't split odd sized vector"); - if (Op.getOpcode() == ISD::CONCAT_VECTORS) { - assert((Op.getNumOperands() % 2) == 0 && - "Can't split odd sized vector concat"); - unsigned HalfOps = Op.getNumOperands() / 2; + SmallVector<SDValue> SubOps; + if (collectConcatOps(Op.getNode(), SubOps, DAG)) { + assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat"); + unsigned HalfOps = SubOps.size() / 2; EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); - SmallVector<SDValue> LoOps(Op->op_begin(), Op->op_begin() + HalfOps); - SmallVector<SDValue> HiOps(Op->op_begin() + HalfOps, Op->op_end()); + SmallVector<SDValue> LoOps(SubOps.begin(), SubOps.begin() + HalfOps); + SmallVector<SDValue> HiOps(SubOps.begin() + HalfOps, SubOps.end()); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps); return std::make_pair(Lo, Hi); diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index b3d8d05f69947..dbb4b9f64f4b7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -1105,19 +1105,18 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-VL-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-VL-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-VL-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-VL-NEXT: vpmovdw %zmm1, %ymm4 -; AVX512-VL-NEXT: vpsrld $16, %zmm1, %zmm1 -; AVX512-VL-NEXT: vpsrld $16, %zmm0, %zmm5 -; AVX512-VL-NEXT: vpsrld $16, %zmm3, %zmm6 -; AVX512-VL-NEXT: vpsrld $16, %zmm2, %zmm7 +; AVX512-VL-NEXT: vpsrld $16, %zmm0, %zmm4 +; AVX512-VL-NEXT: vpsrld $16, %zmm1, %zmm5 +; AVX512-VL-NEXT: vpsrld $16, %zmm2, %zmm6 +; AVX512-VL-NEXT: vpsrld $16, %zmm3, %zmm7 +; AVX512-VL-NEXT: vpmovdw %zmm1, 32(%rsi) ; AVX512-VL-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512-VL-NEXT: vmovdqa %ymm4, 32(%rsi) -; AVX512-VL-NEXT: vpmovdw %zmm2, 64(%rsi) ; AVX512-VL-NEXT: vpmovdw %zmm3, 96(%rsi) -; AVX512-VL-NEXT: vpmovdw %zmm7, 64(%rdx) -; AVX512-VL-NEXT: vpmovdw %zmm6, 96(%rdx) -; AVX512-VL-NEXT: vpmovdw %zmm5, (%rdx) -; AVX512-VL-NEXT: vpmovdw %zmm1, 32(%rdx) +; 
AVX512-VL-NEXT: vpmovdw %zmm2, 64(%rsi) +; AVX512-VL-NEXT: vpmovdw %zmm7, 96(%rdx) +; AVX512-VL-NEXT: vpmovdw %zmm6, 64(%rdx) +; AVX512-VL-NEXT: vpmovdw %zmm5, 32(%rdx) +; AVX512-VL-NEXT: vpmovdw %zmm4, (%rdx) ; AVX512-VL-NEXT: vzeroupper ; AVX512-VL-NEXT: retq ; @@ -1127,19 +1126,18 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vpmovdw %zmm1, %ymm4 -; AVX512-FCP-NEXT: vpsrld $16, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpsrld $16, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpsrld $16, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpsrld $16, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vpsrld $16, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpsrld $16, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpsrld $16, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vpsrld $16, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vpmovdw %zmm1, 32(%rsi) ; AVX512-FCP-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm4, 32(%rsi) -; AVX512-FCP-NEXT: vpmovdw %zmm2, 64(%rsi) ; AVX512-FCP-NEXT: vpmovdw %zmm3, 96(%rsi) -; AVX512-FCP-NEXT: vpmovdw %zmm7, 64(%rdx) -; AVX512-FCP-NEXT: vpmovdw %zmm6, 96(%rdx) -; AVX512-FCP-NEXT: vpmovdw %zmm5, (%rdx) -; AVX512-FCP-NEXT: vpmovdw %zmm1, 32(%rdx) +; AVX512-FCP-NEXT: vpmovdw %zmm2, 64(%rsi) +; AVX512-FCP-NEXT: vpmovdw %zmm7, 96(%rdx) +; AVX512-FCP-NEXT: vpmovdw %zmm6, 64(%rdx) +; AVX512-FCP-NEXT: vpmovdw %zmm5, 32(%rdx) +; AVX512-FCP-NEXT: vpmovdw %zmm4, (%rdx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1149,19 +1147,18 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm4 -; AVX512DQ-NEXT: vpsrld $16, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpsrld $16, %zmm3, %zmm6 -; AVX512DQ-NEXT: vpsrld $16, %zmm2, %zmm7 +; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpsrld $16, %zmm1, %zmm5 +; AVX512DQ-NEXT: vpsrld $16, %zmm2, %zmm6 +; AVX512DQ-NEXT: vpsrld $16, %zmm3, %zmm7 +; AVX512DQ-NEXT: vpmovdw %zmm1, 32(%rsi) ; AVX512DQ-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512DQ-NEXT: vmovdqa %ymm4, 32(%rsi) -; AVX512DQ-NEXT: vpmovdw %zmm2, 64(%rsi) ; AVX512DQ-NEXT: vpmovdw %zmm3, 96(%rsi) -; AVX512DQ-NEXT: vpmovdw %zmm7, 64(%rdx) -; AVX512DQ-NEXT: vpmovdw %zmm6, 96(%rdx) -; AVX512DQ-NEXT: vpmovdw %zmm5, (%rdx) -; AVX512DQ-NEXT: vpmovdw %zmm1, 32(%rdx) +; AVX512DQ-NEXT: vpmovdw %zmm2, 64(%rsi) +; AVX512DQ-NEXT: vpmovdw %zmm7, 96(%rdx) +; AVX512DQ-NEXT: vpmovdw %zmm6, 64(%rdx) +; AVX512DQ-NEXT: vpmovdw %zmm5, 32(%rdx) +; AVX512DQ-NEXT: vpmovdw %zmm4, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -1171,19 +1168,18 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovdw %zmm1, %ymm4 -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovdw %zmm1, 32(%rsi) ; 
AVX512DQ-FCP-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, 32(%rsi) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm2, 64(%rsi) ; AVX512DQ-FCP-NEXT: vpmovdw %zmm3, 96(%rsi) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm7, 64(%rdx) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm6, 96(%rdx) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm5, (%rdx) -; AVX512DQ-FCP-NEXT: vpmovdw %zmm1, 32(%rdx) +; AVX512DQ-FCP-NEXT: vpmovdw %zmm2, 64(%rsi) +; AVX512DQ-FCP-NEXT: vpmovdw %zmm7, 96(%rdx) +; AVX512DQ-FCP-NEXT: vpmovdw %zmm6, 64(%rdx) +; AVX512DQ-FCP-NEXT: vpmovdw %zmm5, 32(%rdx) +; AVX512DQ-FCP-NEXT: vpmovdw %zmm4, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll index a034363895c0e..4a99ebecab5c8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll @@ -445,14 +445,14 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX512-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm4, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride2_vf16: @@ -461,14 +461,14 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm4, 16(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) +; 
AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride2_vf16: @@ -477,14 +477,14 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm4, 16(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride2_vf16: @@ -493,14 +493,14 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 16(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride2_vf16: @@ -684,22 +684,22 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} 
xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512-NEXT: vmovdqa %xmm3, 96(%rdx) -; AVX512-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX512-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX512-NEXT: vmovdqa %xmm5, 80(%rdx) -; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX512-NEXT: vmovdqa %xmm7, 80(%rdx) +; AVX512-NEXT: vmovdqa %xmm3, 112(%rdx) +; AVX512-NEXT: vmovdqa %xmm4, 96(%rdx) ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm8, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm5, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512-NEXT: vmovdqa %xmm8, 48(%rdx) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride2_vf32: @@ -712,22 +712,22 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX512-FCP-NEXT: 
vmovdqa %xmm2, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm5, 80(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm7, 80(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm3, 112(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm4, 96(%rdx) ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm8, 16(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm5, 16(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm8, 48(%rdx) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride2_vf32: @@ -740,22 +740,22 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512DQ-NEXT: vmovdqa %xmm3, 96(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX512DQ-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm5, 80(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm7, 80(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm3, 112(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm4, 96(%rdx) ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm8, 16(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm5, 16(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm8, 48(%rdx) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride2_vf32: @@ -768,22 +768,22 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = 
xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 80(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, 80(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 112(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 96(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 16(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 16(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 48(%rdx) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride2_vf32: @@ -1075,214 +1075,214 @@ define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; ; AVX512-LABEL: store_i16_stride2_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 80(%rsi), %xmm1 +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512-NEXT: vmovdqa 96(%rsi), %xmm5 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX512-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-NEXT: vmovdqa 112(%rsi), %xmm5 +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = 
xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512-NEXT: vmovdqa 112(%rsi), %xmm6 -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm7 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512-NEXT: vmovdqa 16(%rsi), %xmm9 ; AVX512-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX512-NEXT: vmovdqa 48(%rsi), %xmm11 -; AVX512-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512-NEXT: vmovdqa %xmm9, 48(%rdx) -; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512-NEXT: vmovdqa %xmm7, 16(%rdx) -; AVX512-NEXT: vmovdqa %xmm14, (%rdx) +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; AVX512-NEXT: vmovdqa %xmm7, (%rdx) +; AVX512-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm9, 32(%rdx) +; AVX512-NEXT: vmovdqa %xmm13, 48(%rdx) +; AVX512-NEXT: vmovdqa %xmm10, 64(%rdx) +; AVX512-NEXT: vmovdqa %xmm14, 80(%rdx) ; AVX512-NEXT: vmovdqa %xmm11, 112(%rdx) -; AVX512-NEXT: vmovdqa %xmm13, 96(%rdx) -; AVX512-NEXT: vmovdqa %xmm10, 80(%rdx) -; AVX512-NEXT: vmovdqa %xmm15, 64(%rdx) -; AVX512-NEXT: vmovdqa %xmm6, 240(%rdx) -; AVX512-NEXT: vmovdqa %xmm8, 224(%rdx) -; AVX512-NEXT: vmovdqa %xmm5, 208(%rdx) -; AVX512-NEXT: vmovdqa %xmm4, 192(%rdx) -; AVX512-NEXT: vmovdqa %xmm3, 176(%rdx) -; AVX512-NEXT: vmovdqa %xmm2, 
160(%rdx) -; AVX512-NEXT: vmovdqa %xmm1, 144(%rdx) -; AVX512-NEXT: vmovdqa64 %xmm16, 128(%rdx) +; AVX512-NEXT: vmovdqa %xmm15, 96(%rdx) +; AVX512-NEXT: vmovdqa %xmm6, 192(%rdx) +; AVX512-NEXT: vmovdqa %xmm8, 208(%rdx) +; AVX512-NEXT: vmovdqa %xmm5, 240(%rdx) +; AVX512-NEXT: vmovdqa %xmm4, 224(%rdx) +; AVX512-NEXT: vmovdqa %xmm3, 128(%rdx) +; AVX512-NEXT: vmovdqa %xmm2, 144(%rdx) +; AVX512-NEXT: vmovdqa %xmm1, 176(%rdx) +; AVX512-NEXT: vmovdqa64 %xmm16, 160(%rdx) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride2_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm5 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm5 +; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm6 -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm7 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm9 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm11 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = 
xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512-FCP-NEXT: vmovdqa %xmm9, 48(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm7, 16(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm14, (%rdx) +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm9, 32(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm13, 48(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm10, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm14, 80(%rdx) ; AVX512-FCP-NEXT: vmovdqa %xmm11, 112(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm13, 96(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm10, 80(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm15, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm6, 240(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm8, 224(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm5, 208(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm4, 192(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm3, 176(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm2, 160(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm1, 144(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %xmm16, 128(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm15, 96(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm6, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm8, 208(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm5, 240(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm4, 224(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm3, 128(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm2, 144(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm1, 176(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %xmm16, 160(%rdx) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride2_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm1 +; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-NEXT: vmovdqa 
96(%rsi), %xmm5 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm5 +; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm6 -; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm7 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm9 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm11 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512DQ-NEXT: vmovdqa %xmm9, 48(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm7, 16(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm14, (%rdx) +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; AVX512DQ-NEXT: vmovdqa %xmm7, (%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm9, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm13, 48(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm10, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm14, 80(%rdx) ; AVX512DQ-NEXT: vmovdqa %xmm11, 112(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm13, 96(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm10, 80(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm15, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm6, 240(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm8, 224(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm5, 208(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm4, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm3, 176(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm2, 160(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm1, 144(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %xmm16, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm15, 96(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm6, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm8, 208(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm5, 240(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm4, 224(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm3, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm2, 144(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm1, 176(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %xmm16, 160(%rdx) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride2_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm9 ; 
AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, 48(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, 16(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, (%rdx) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, 32(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, 48(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, 80(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, 112(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, 96(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, 80(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 240(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 224(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 208(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 176(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 160(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 144(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, 96(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 208(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 240(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 224(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 144(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 
176(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, 160(%rdx) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride2_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index 39230b67d380f..7303f6124afcb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -246,8 +246,8 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1] ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2] ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[2,1] +; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1] ; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[0],xmm4[3] ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll index c15eff9141fff..052dc16e7cb1f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -254,9 +254,9 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm0[1],xmm1[1],zero,zero ; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm3[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm2[0] ; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,0] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,0],xmm0[3,0] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 0fba7de803488..407b7313f05fe 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -339,9 +339,9 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovaps (%rdx), %xmm2 ; AVX-NEXT: vmovaps (%rcx), %xmm3 ; AVX-NEXT: vmovaps (%r8), %xmm4 -; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm3[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm2[0] ; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,0] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6 ; AVX-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm0[1],xmm1[1],zero ; AVX-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm7[1,2],xmm2[1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll index bead2c94cf121..c34b5d2ed8c7c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -469,10 +469,10 @@ define void 
@store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovaps (%r8), %xmm2 ; AVX-NEXT: vmovaps (%r9), %xmm6 ; AVX-NEXT: vmovaps (%r10), %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm7 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 -; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] -; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[2,1],ymm9[6,4],ymm7[6,5] +; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm2[2,1],ymm7[6,4],ymm2[6,5] ; AVX-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm3[2],xmm4[2] ; AVX-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm5[1],xmm0[1],zero ; AVX-NEXT: vinsertps {{.*#+}} xmm10 = xmm4[1],xmm10[1,2],zero @@ -485,9 +485,9 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vunpckhps {{.*#+}} xmm10 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4],ymm9[5,6,7] -; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm5[0] ; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,0] ; AVX-NEXT: vbroadcastss 4(%rdi), %xmm4 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll index cf246e4ede089..cac06cfa74cfc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -460,12 +460,12 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm5[3,0] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,0],xmm4[3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX-NEXT: vmovaps %ymm0, 96(%rax) ; AVX-NEXT: vmovaps %ymm10, 64(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll index 53a6d306ef84d..30be6c88514bf 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll @@ -544,14 +544,14 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX512-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm4, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride2_vf32: @@ -560,14 +560,14 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm4, 16(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride2_vf32: @@ -576,14 +576,14 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-NEXT: vmovdqa 16(%rsi), 
%xmm1 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm4, 16(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride2_vf32: @@ -592,14 +592,14 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 16(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx) ; 
AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i8_stride2_vf32: @@ -787,22 +787,22 @@ define void @store_i8_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512-NEXT: vmovdqa %xmm3, 96(%rdx) -; AVX512-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX512-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX512-NEXT: vmovdqa %xmm5, 80(%rdx) -; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX512-NEXT: vmovdqa %xmm7, 80(%rdx) +; AVX512-NEXT: vmovdqa %xmm3, 112(%rdx) +; AVX512-NEXT: vmovdqa %xmm4, 96(%rdx) ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm8, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm5, 16(%rdx) +; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512-NEXT: vmovdqa %xmm8, 48(%rdx) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride2_vf64: @@ -815,22 +815,22 @@ define void @store_i8_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 ; 
AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX512-FCP-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm5, 80(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm7, 80(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm3, 112(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm4, 96(%rdx) ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm8, 16(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm5, 16(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm8, 48(%rdx) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride2_vf64: @@ -843,22 +843,22 @@ define void @store_i8_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512DQ-NEXT: 
vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512DQ-NEXT: vmovdqa %xmm3, 96(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX512DQ-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm5, 80(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm7, 80(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm3, 112(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm4, 96(%rdx) ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm8, 16(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm5, 16(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm8, 48(%rdx) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride2_vf64: @@ -871,22 +871,22 @@ define void @store_i8_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = 
xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 80(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, 80(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 112(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 96(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 16(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 16(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 48(%rdx) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i8_stride2_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll index 8802e8a779332..782a81be47603 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ 
-1639,119 +1639,122 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX-LABEL: store_i8_stride3_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: vmovdqa (%rdi), %xmm7 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: vmovdqa (%rdi), %xmm8 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] -; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa (%rdx), %xmm3 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] +; AVX-NEXT: vpshufb %xmm7, %xmm8, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqa (%rdx), %xmm5 ; AVX-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX-NEXT: vpshufb %xmm8, %xmm9, %xmm0 +; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm8, %xmm6, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm4 +; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm7 ; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,128,128,128,6,7,8,9,10] -; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm3 ; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128] -; AVX-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX-NEXT: vmovdqa 48(%rsi), %xmm14 -; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm15 -; AVX-NEXT: vpor %xmm2, %xmm15, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqa 16(%rsi), %xmm13 +; AVX-NEXT: vmovdqa 32(%rsi), %xmm14 +; AVX-NEXT: vmovdqa 48(%rsi), %xmm15 +; AVX-NEXT: vpshufb %xmm11, %xmm15, %xmm12 +; AVX-NEXT: vpor %xmm3, %xmm12, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm15 -; AVX-NEXT: vpor %xmm6, %xmm15, %xmm0 +; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm12 +; AVX-NEXT: vpor %xmm6, %xmm12, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm15 -; AVX-NEXT: vpor %xmm9, %xmm15, %xmm0 +; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm12 +; AVX-NEXT: vpor %xmm9, %xmm12, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa (%rsi), %xmm15 -; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX-NEXT: vpshufb %xmm11, %xmm15, %xmm10 -; AVX-NEXT: vpor %xmm7, %xmm10, %xmm0 +; AVX-NEXT: vmovdqa (%rsi), %xmm2 +; AVX-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm10 +; AVX-NEXT: vpor %xmm8, %xmm10, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15] -; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm6 -; AVX-NEXT: vmovdqa %xmm1, %xmm0 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] -; AVX-NEXT: vpshufb %xmm10, 
%xmm7, %xmm5 -; AVX-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15] -; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm4 -; AVX-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] -; AVX-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4] -; AVX-NEXT: vpor %xmm10, %xmm8, %xmm10 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpor %xmm2, %xmm14, %xmm14 -; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] -; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] -; AVX-NEXT: vmovdqa %xmm0, %xmm10 +; AVX-NEXT: vmovdqa %xmm5, %xmm0 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15] +; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; AVX-NEXT: vpshufb %xmm11, %xmm10, %xmm6 +; AVX-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX-NEXT: vpshufb %xmm11, %xmm10, %xmm5 +; AVX-NEXT: vmovdqa 48(%rdx), %xmm10 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] +; AVX-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4] +; AVX-NEXT: vpor %xmm7, %xmm11, %xmm11 +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4] +; AVX-NEXT: vpor %xmm4, %xmm11, %xmm11 +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; AVX-NEXT: vmovdqa %xmm1, %xmm11 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpor %xmm1, %xmm14, %xmm14 -; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] -; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] -; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; AVX-NEXT: vmovdqa %xmm0, %xmm3 +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: vpor %xmm0, %xmm14, %xmm14 -; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] -; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr {{.*#+}} xmm15 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] +; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm15 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm6 ; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] -; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm2 +; AVX-NEXT: vpor %xmm2, %xmm6, %xmm2 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6 -; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm3 -; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm6 +; AVX-NEXT: vpshufb %xmm9, %xmm11, %xmm4 +; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6 -; AVX-NEXT: vpshufb %xmm9, %xmm11, %xmm11 -; AVX-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm14 -; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX-NEXT: vpshufb %xmm11, %xmm15, %xmm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm10 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm6 +; AVX-NEXT: vpshufb %xmm9, %xmm12, %xmm12 +; AVX-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm8 +; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm9 +; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = 
[0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm0 +; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm10 +; AVX-NEXT: vpshufb %xmm9, %xmm15, %xmm3 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm11 +; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm1 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm12 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX-NEXT: vmovdqa %xmm3, 64(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 80(%rcx) -; AVX-NEXT: vmovdqa %xmm14, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm4, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm12, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm5, 160(%rcx) -; AVX-NEXT: vmovdqa %xmm8, 176(%rcx) -; AVX-NEXT: vmovdqa %xmm13, 96(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 80(%rcx) +; AVX-NEXT: vmovdqa %xmm4, 64(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm5, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm10, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm7, 176(%rcx) +; AVX-NEXT: vmovdqa %xmm8, 160(%rcx) +; AVX-NEXT: vmovdqa %xmm11, 96(%rcx) ; AVX-NEXT: vmovdqa %xmm6, 112(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 128(%rcx) -; AVX-NEXT: vmovdqa %xmm10, 144(%rcx) -; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: vmovdqa %xmm1, 128(%rcx) +; AVX-NEXT: vmovdqa %xmm12, 144(%rcx) +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: retq ; ; AVX2-LABEL: store_i8_stride3_vf64: diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index bc83cc1cab42d..7ad9fb0c27170 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -3595,41 +3595,73 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; 
AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512BW-SLOW: # %bb.0: @@ -3757,10 +3789,10 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -3796,10 +3828,10 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -3987,11 +4019,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -4006,29 +4037,27 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastw %xmm1, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -4043,19 +4072,18 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm1, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -4188,10 +4216,10 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4227,10 +4255,10 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2