diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 075a49df9da9a..cb117475dbe4c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -49963,33 +49963,15 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, } return true; }; - if (User->getOpcode() == X86ISD::VBROADCAST_LOAD && - getTargetConstantFromBasePtr(Ptr)) { - // See if we are loading a constant that has also been broadcast. - APInt Undefs, UserUndefs; - SmallVector Bits, UserBits; - if (getTargetConstantBitsFromNode(SDValue(N, 0), 8, Undefs, Bits) && - getTargetConstantBitsFromNode(SDValue(User, 0), 8, UserUndefs, - UserBits)) { - UserUndefs = UserUndefs.trunc(Undefs.getBitWidth()); - UserBits.truncate(Bits.size()); - if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) { - SDValue Extract = extractSubVector( - SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits()); - Extract = DAG.getBitcast(RegVT, Extract); - return DCI.CombineTo(N, Extract, SDValue(User, 1)); - } - } - } - if (ISD::isNormalLoad(User)) { - // See if we are loading a constant that matches in the lower - // bits of a longer constant (but from a different constant pool ptr). - SDValue UserPtr = cast(User)->getBasePtr(); - const Constant *LdC = getTargetConstantFromBasePtr(Ptr); - const Constant *UserC = getTargetConstantFromBasePtr(UserPtr); - if (LdC && UserC && UserPtr != Ptr && - LdC->getType()->getPrimitiveSizeInBits() < - UserC->getType()->getPrimitiveSizeInBits()) { + // See if we are loading a constant that matches in the lower + // bits of a longer constant (but from a different constant pool ptr). + SDValue UserPtr = cast(User)->getBasePtr(); + const Constant *LdC = getTargetConstantFromBasePtr(Ptr); + const Constant *UserC = getTargetConstantFromBasePtr(UserPtr); + if (LdC && UserC && UserPtr != Ptr) { + unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits(); + unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits(); + if (LdSize < UserSize || !ISD::isNormalLoad(User)) { APInt Undefs, UserUndefs; SmallVector Bits, UserBits; if (getTargetConstantBitsFromNode(SDValue(N, 0), 8, Undefs, Bits) && diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll index 9aa90de654a44..9125f2492ebb8 100644 --- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -1459,17 +1459,16 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) { ; ; AVX-64-LABEL: f8xi64_i128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] -; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 -; AVX-64-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 -; AVX-64-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,0,1] ; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX-64-NEXT: vpaddq %xmm2, %xmm3, %xmm3 +; AVX-64-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-64-NEXT: vpaddq %xmm2, %xmm3, %xmm3 +; AVX-64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index c7fca67c75aea..78065bc73c1d3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -3842,117 +3842,108 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm10 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm9 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm11 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm13 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm13 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm14 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm13, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm15 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm15 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm15, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) @@ -8746,282 +8737,220 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512BW-LABEL: load_i16_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm15 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm13, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm18 +; AVX512BW-NEXT: movb $-64, %dil +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm13, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm8, %zmm4 +; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 -; AVX512BW-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm10, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm29, %zmm10, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r11) +; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r11) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <512 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 1bd7025307d2b..faa1642831c15 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -1617,117 +1617,108 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 ; AVX512F-NEXT: movb $-64, %dil ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] -; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,1,9,17,25] -; AVX512F-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26] -; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm11 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27] -; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm12 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28] -; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm13 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm14 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm16 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30] -; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm15 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31] -; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm15, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} +; AVX512F-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%rcx) @@ -1745,117 +1736,108 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm10 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,1,9,17,25] -; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27] -; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm12 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28] -; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm13 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm14 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm15 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) @@ -3689,563 +3671,439 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i32_stride8_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512F-NEXT: movb $-64, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 -; AVX512F-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm15 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 +; AVX512F-NEXT: movb $-64, %dil +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] -; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512F-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512F-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] -; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] -; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] -; AVX512F-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 -; AVX512F-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} +; AVX512F-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} +; AVX512F-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm28, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm24, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%r11) +; AVX512F-NEXT: vmovdqa64 %zmm27, (%r11) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%r10) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm15 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm13, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm18 +; AVX512BW-NEXT: movb $-64, %dil +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm13, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm13, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm8, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] -; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm10, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r11) +; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r11) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <256 x i32>, ptr %in.vec, align 64 @@ -8127,1227 +7985,1195 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i32_stride8_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm7 -; AVX512F-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512F-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512F-NEXT: vmovaps 1536(%rdi), %zmm0 +; AVX512F-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovaps 1664(%rdi), %zmm0 ; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm4 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm19, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm9 -; AVX512F-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512F-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm8 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm25, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm4 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm15 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm9, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm8, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm7, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm2, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm9 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm7, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm10, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm2, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm10, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm24, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm7, %zmm18 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm10, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm7, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm22, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm7, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm10, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm3, %zmm10, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm10, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm14, %zmm1, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm21 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm22, %zmm3, %zmm2 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 -; AVX512F-NEXT: vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm20, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm8 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, (%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, (%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, (%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 192(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 64(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, (%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, (%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm7 -; AVX512BW-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512BW-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512BW-NEXT: vmovaps 1536(%rdi), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovaps 1664(%rdi), %zmm0 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm19, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm8 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm25, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm4 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm15 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm9 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm7, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm10, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm7, %zmm18 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm7, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm10, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm1, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm21 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm3, %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 -; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm20, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm8 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, (%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 64(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, (%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, (%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <512 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll index 4a2c38168bdf4..ccc15a4119ba4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -520,118 +520,116 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-LABEL: load_i64_stride5_vf8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <1,6,11,u> -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <11,0,5,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <11,0,5,u> +; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <12,1,6,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <1,6,11,u> -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <11,0,5,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <11,0,5,u> +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <12,1,6,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <40 x i64>, ptr %in.vec, align 64 @@ -1183,90 +1181,90 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,6,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <1,6,11,u> +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm14, %zmm13 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,11,0,1,6,11,0,1] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,7,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,7,12,u> +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = <11,0,5,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = <11,0,5,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,0,11,0,5,0,11] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm12, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512F-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512F-NEXT: vzeroupper @@ -1280,90 +1278,90 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm10, %zmm7 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,6,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm12 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <1,6,11,u> +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm13 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,7,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,7,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <11,0,5,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <11,0,5,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm18 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512BW-NEXT: vzeroupper @@ -2559,397 +2557,403 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride5_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 ; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm21 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm26 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm25 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm15 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] ; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <2,7,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <2,7,12,u> +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] ; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u> -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,0,5,u> +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm24, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm31, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm11, %zmm18 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm16 ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm24, %zmm3 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm28, %zmm7 ; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm13, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm24, %zmm10 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm15 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vpermt2q %zmm11, %zmm7, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm27, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r9) -; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 +; AVX512F-NEXT: vmovdqa64 %zmm18, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%r9) +; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm15 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <2,7,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <2,7,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u> -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,0,5,u> +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm31, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm18 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm24, %zmm3 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm7 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm13, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm16 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm10 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm15 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r9) -; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r9) +; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <160 x i64>, ptr %in.vec, align 64 @@ -5486,292 +5490,291 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm6 +; AVX512F-NEXT: subq $3336, %rsp # imm = 0xD08 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm9 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm15, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm5 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm29 = <1,6,11,u> ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u> ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <11,0,5,u> ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm21 ; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm25 ; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 ; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm14 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm11 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm18 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm16 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 ; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm11 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm13 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -5779,467 +5782,467 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} ; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} +; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} ; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 448(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 384(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 320(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 256(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%rsi) -; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 448(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 256(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 320(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 192(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 384(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, 320(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm30, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm23, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 384(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 448(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 384(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 320(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 256(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512F-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 448(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 256(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 320(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm27, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm30, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm28, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm24, 384(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 320(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm31, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%r9) -; AVX512F-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512F-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%r9) +; AVX512F-NEXT: addq $3336, %rsp # imm = 0xD08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 +; AVX512BW-NEXT: subq $3336, %rsp # imm = 0xD08 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = <1,6,11,u> ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u> ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <11,0,5,u> ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 ; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -6247,176 +6250,177 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 448(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 384(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 256(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 448(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 256(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 320(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 384(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 448(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 256(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 448(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 256(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 320(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 384(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 320(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm31, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r9) -; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%r9) +; AVX512BW-NEXT: addq $3336, %rsp # imm = 0xD08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <320 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index 7d9c056716cee..91a70fb000dd6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -751,223 +751,221 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512F-LABEL: load_i64_stride7_vf8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <0,7,14,u> -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 -; AVX512F-NEXT: movb $24, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm7[4,5,4,5],zmm6[4,5,4,5] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512F-NEXT: movb $-32, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,10,0,5,6,10] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm10 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm8, %zmm8 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm12 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm13 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = [5,12] +; AVX512F-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512F-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512F-NEXT: movb $24, %r10b +; AVX512F-NEXT: kmovw %r10d, %k2 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 +; AVX512F-NEXT: movb $-32, %r10b +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] ; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u> -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,11,0,5,6,11] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm11, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm10 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,9,2,9,2,9,2,9] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm11, %zmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm10, (%r10) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 +; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm3, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rdi) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf8: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,11] +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,7,14,u> -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: movb $24, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm7[4,5,4,5],zmm6[4,5,4,5] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm14 ; AVX512BW-NEXT: movb $-32, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [14,0,0,7,14,0,0,7] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm8, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm13 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u> -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm9, %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm10 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,11,4,11] +; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm4 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <56 x i64>, ptr %in.vec, align 64 @@ -1724,183 +1722,185 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm28 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,7,14,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm7, %zmm29 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm18, %zmm31 ; AVX512F-NEXT: movb $24, %r11b ; AVX512F-NEXT: kmovw %r11d, %k2 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm27[4,5,4,5] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5] ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] ; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,5,6,9,0,5,6,9] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm18, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9] +; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 ; AVX512F-NEXT: movb $-32, %r11b ; AVX512F-NEXT: kmovw %r11d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} -; AVX512F-NEXT: vpermi2q %zmm24, %zmm6, %zmm7 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[4,5,4,5],zmm25[4,5,4,5] -; AVX512F-NEXT: vpermi2q %zmm30, %zmm0, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm18, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,4,11,4,11,4,11,4] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm18, %zmm14 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm17, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm24, %zmm17 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] ; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm22, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <9,0,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512F-NEXT: vpermi2q %zmm30, %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm20, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm22, %zmm19 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm24, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] -; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,11,0,5,6,11] +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm19, %zmm17 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10] ; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm21, %zmm19 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm21, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11] ; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm18 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm18, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512F-NEXT: vpermi2q %zmm30, %zmm0, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm21, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm25, %zmm2, %zmm23 -; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm22, %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,12,0,5,6,12] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm21, %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm21 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm25, %zmm19 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm23, %zmm22 +; AVX512F-NEXT: vpermi2q %zmm26, %zmm3, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] ; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm20, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512F-NEXT: vpermi2q %zmm30, %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm21, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm25, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm23, %zmm24 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm26, %zmm25 ; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm4 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm5 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm5 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm25, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm30, %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm23, %zmm4 -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-NEXT: vpermi2q %zmm24, %zmm6, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm23 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [9,0,7,0,9,0,7,0] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm18 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4 +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-NEXT: vpermi2q %zmm27, %zmm2, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm12 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm13, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm31 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm30, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm5 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpermi2q %zmm24, %zmm6, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm15, %zmm5, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm10, %zmm20 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm7 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermi2q %zmm27, %zmm2, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm12, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm12, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%r10) -; AVX512F-NEXT: vmovdqa64 %zmm31, (%r10) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm31, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%r10) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-NEXT: vzeroupper @@ -1910,183 +1910,185 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,7,14,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm7, %zmm29 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm31 ; AVX512BW-NEXT: movb $24, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm28[4,5,4,5] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm14 ; AVX512BW-NEXT: movb $-32, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm6, %zmm7 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[4,5,4,5],zmm26[4,5,4,5] -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,4,11,4,11,4,11,4] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm14 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm24, %zmm17 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <9,0,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm20, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm22, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm24, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [14,0,0,7,14,0,0,7] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm18 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm2, %zmm23 -; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm22, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,12,0,5,6,12] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm17 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm22, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm20 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm25 = mem[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm25, %xmm25 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm20, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm26, %zmm23 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm23, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm25, %zmm19 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm4, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm22 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm5 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,11] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm22, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm23, %zmm4 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm5 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm6, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [5,12] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm31, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm8 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm6, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm27 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm27, %xmm27 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm27, %zmm22, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm24 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm26, %zmm25 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,11] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm24 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm18 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm20 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm10 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] ; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm31, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -3680,892 +3682,960 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride7_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2216, %rsp # imm = 0x8A8 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovaps 1024(%rdi), %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: subq $2728, %rsp # imm = 0xAA8 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] -; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] ; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 -; AVX512F-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512F-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [10,3,10,3,10,3,10,3] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm30, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: movb $24, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,11,4,11] -; AVX512F-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm22, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm9, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm30, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 -; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 -; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm28, %zmm30, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm10 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm10, %zmm8 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm9, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm9, %zmm23 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm10, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm22, %zmm29 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm10, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm10, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm29 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm30 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: movb $24, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [4,11,4,11] +; AVX512F-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5] +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm21, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm17 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm21, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm21, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21 +; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16 +; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11 +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm15 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 -; AVX512F-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm15 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm12 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm30, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} -; AVX512F-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512F-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm15 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm7 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512F-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} -; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512F-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm15 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm20, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} +; AVX512F-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm14 +; AVX512F-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm15 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm14 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm11, %zmm6, %zmm11 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm8, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm28, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm23, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm21, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, (%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512F-NEXT: vmovaps %zmm15, 64(%rax) -; AVX512F-NEXT: addq $2216, %rsp # imm = 0x8A8 +; AVX512F-NEXT: vmovaps %zmm7, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512F-NEXT: vmovaps %zmm8, (%rax) +; AVX512F-NEXT: vmovaps %zmm11, 64(%rax) +; AVX512F-NEXT: addq $2728, %rsp # imm = 0xAA8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm12 +; AVX512BW-NEXT: subq $2760, %rsp # imm = 0xAC8 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512BW-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm28 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm29 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm31 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm16, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm23 ; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [4,11,4,11] -; AVX512BW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,11,4,11] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 -; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18 ; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm30, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm24 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm6 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm7 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm20 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm17 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm17 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm13 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm12 +; AVX512BW-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9 ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512BW-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512BW-NEXT: vmovaps %zmm8, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512BW-NEXT: vmovaps %zmm9, (%rax) +; AVX512BW-NEXT: vmovaps %zmm11, 64(%rax) +; AVX512BW-NEXT: addq $2760, %rsp # imm = 0xAC8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <224 x i64>, ptr %in.vec, align 64 @@ -8015,45 +8085,46 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride7_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: subq $7624, %rsp # imm = 0x1DC8 +; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 @@ -8061,972 +8132,997 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 ; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm16, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4,11] -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm5, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 2816(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5 +; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm9, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8 +; AVX512F-NEXT: vmovdqa 2368(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm9, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm11, %zmm14 +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm21 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10 +; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm15 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm15 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm28 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10 +; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm14 +; AVX512F-NEXT: vpermi2q %zmm25, %zmm14, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm30, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm9, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm11 +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 2880(%rdi), %ymm4 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm10 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm11, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 2432(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 3328(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm25, %zmm14, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] +; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm23, %zmm12 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm28 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm22 ; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm23, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm27 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm23, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm25, %zmm6, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 2368(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm24, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm26 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm11 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm8 -; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm6 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm31 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm4 -; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm7 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2432(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa64 %ymm29, %ymm3 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 3328(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpermi2q %zmm11, %zmm6, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [10,3,10,3,10,3,10,3] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [11,4,11,4,11,4,11,4] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm25, %zmm31, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm25 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm7, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm26, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm7, %zmm28 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm18 ; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm21 ; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm12 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm24 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $24, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <0,7,14,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm13[4,5,4,5],zmm29[4,5,4,5] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,4,11] -; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm1[4,5,4,5],zmm17[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm23, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm13, %zmm10, %zmm23 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm2[4,5,4,5],zmm4[4,5,4,5] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm26 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm26 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <9,0,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-NEXT: movb $-32, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm15 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm22, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm15 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm14 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2304(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm13 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm4 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm8 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqa 2304(%rdi), %ymm10 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm12 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm13 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm15 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm29, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm23, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm11, 384(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 320(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm6, 320(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 320(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm28, 384(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm31, 448(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm13, 384(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm13, 320(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm13, 256(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm13, 192(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm13, 128(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm13, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm13, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm18, 448(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm2, 256(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm23, 320(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm25, 448(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm27, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm24, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 448(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 256(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 320(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 128(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, (%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 384(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 448(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 256(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 320(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 384(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-NEXT: vmovaps %zmm11, 384(%rax) +; AVX512F-NEXT: vmovaps %zmm4, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512F-NEXT: addq $7624, %rsp # imm = 0x1DC8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm4 +; AVX512BW-NEXT: subq $7624, %rsp # imm = 0x1DC8 +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 @@ -9034,867 +9130,891 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,11] -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm0 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm8 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm9 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm9 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm7 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm9 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm10, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm24, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm4 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm25 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm10, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm21 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm29, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm19, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm20, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm18, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm11, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 ; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 ; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm26 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm27 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,7,14,u> ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm31[4,5,4,5] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,11,4,11] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm1[4,5,4,5],zmm23[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm9[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm16, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm28 +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm15, %zmm16 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm26, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm21, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm18 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm19, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm19 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm19, %xmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm20, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm20 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm22 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm22 = mem[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %ymm24 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm21, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %ymm24 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm26, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm10, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm10, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 448(%rdx) +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm0 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa 1408(%rdi), %ymm4 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm5 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa 2304(%rdi), %ymm6 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa 1856(%rdi), %ymm7 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm8 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm12, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm12, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm12, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm12, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm12, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9931,21 +10051,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512BW-NEXT: addq $7624, %rsp # imm = 0x1DC8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <448 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll index 4738b2344255b..638f7f685319b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll @@ -815,229 +815,225 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512F-LABEL: load_i64_stride8_vf8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm11 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm5 -; AVX512F-NEXT: movb $-64, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm8 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm13 -; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm16 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm9 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm13 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm17 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm19, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[4],zmm11[6],zmm7[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm11[1],zmm7[1],zmm11[3],zmm7[3],zmm11[5],zmm7[5],zmm11[7],zmm7[7] -; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512F-NEXT: movb $-64, %bl +; AVX512F-NEXT: kmovw %ebx, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm11 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm11, %ymm11 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm15 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] -; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm17, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%r10) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rdi) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, (%r11) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%r10) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm11 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm5 -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm16 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm17 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[4],zmm11[6],zmm7[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm11[1],zmm7[1],zmm11[3],zmm7[3],zmm11[5],zmm7[5],zmm11[7],zmm7[7] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512BW-NEXT: movb $-64, %bl +; AVX512BW-NEXT: kmovd %ebx, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm10, %ymm10 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm16 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm11 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm11, %ymm11 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm15 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] -; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm17, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdi) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm13, %zmm13 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r11) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <64 x i64>, ptr %in.vec, align 64 @@ -1926,221 +1922,222 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-LABEL: load_i64_stride8_vf16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm18 -; AVX512F-NEXT: vmovaps 640(%rdi), %zmm0 +; AVX512F-NEXT: vmovaps 576(%rdi), %zmm0 ; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm19, %zmm15 +; AVX512F-NEXT: vmovaps 512(%rdi), %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm7 +; AVX512F-NEXT: vmovaps (%rdi), %zmm0 +; AVX512F-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm15 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm29, %zmm18 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm16 -; AVX512F-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm21 -; AVX512F-NEXT: vinserti32x4 $1, 128(%rdi), %ymm0, %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm21[0],ymm16[2],ymm21[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm19, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm5[0],ymm20[2],ymm5[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm25 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm19, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm11 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm26 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm20[1],ymm5[1],ymm20[3],ymm5[3] -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm29 -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm30, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm24[0],zmm14[0],zmm24[2],zmm14[2],zmm24[4],zmm14[4],zmm24[6],zmm14[6] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm28[0],zmm11[0],zmm28[2],zmm11[2],zmm28[4],zmm11[4],zmm28[6],zmm11[6] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vpermi2q %zmm31, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm18, %zmm8, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm27 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm30 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm14[1],zmm24[3],zmm14[3],zmm24[5],zmm14[5],zmm24[7],zmm14[7] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm30, %zmm28, %zmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm27, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm27, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm26, %zmm1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm24 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm13[0],zmm12[0],zmm13[2],zmm12[2],zmm13[4],zmm12[4],zmm13[6],zmm12[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm13[1],zmm12[1],zmm13[3],zmm12[3],zmm13[5],zmm12[5],zmm13[7],zmm12[7] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm23, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm21, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm27, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm27, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm27, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm30, %zmm28, %zmm27 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm26, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm28[1],zmm11[1],zmm28[3],zmm11[3],zmm28[5],zmm11[5],zmm28[7],zmm11[7] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm31, %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm8, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm15 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm31, %zmm10, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm9, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15] -; AVX512F-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm11 -; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm11, %ymm11 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm29 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm16[1],ymm21[1],ymm16[3],ymm21[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm29, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX512F-NEXT: vpermt2q %zmm30, %zmm29, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm26, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm26, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm21 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm1 +; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] +; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm6 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512F-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2148,221 +2145,222 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride8_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm18 -; AVX512BW-NEXT: vmovaps 640(%rdi), %zmm0 +; AVX512BW-NEXT: vmovaps 576(%rdi), %zmm0 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm15 +; AVX512BW-NEXT: vmovaps 512(%rdi), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm7 +; AVX512BW-NEXT: vmovaps (%rdi), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm15 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm18 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm21 -; AVX512BW-NEXT: vinserti32x4 $1, 128(%rdi), %ymm0, %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm21[0],ymm16[2],ymm21[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm22 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm5[0],ymm20[2],ymm5[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm25 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm19, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm11 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm26 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm20[1],ymm5[1],ymm20[3],ymm5[3] -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm29 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm30, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm24[0],zmm14[0],zmm24[2],zmm14[2],zmm24[4],zmm14[4],zmm24[6],zmm14[6] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm28[0],zmm11[0],zmm28[2],zmm11[2],zmm28[4],zmm11[4],zmm28[6],zmm11[6] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm8, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm27 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm30 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm14[1],zmm24[3],zmm14[3],zmm24[5],zmm14[5],zmm24[7],zmm14[7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm30, %zmm28, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm25 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm26, %zmm1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm13[0],zmm12[0],zmm13[2],zmm12[2],zmm13[4],zmm12[4],zmm13[6],zmm12[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm13[1],zmm12[1],zmm13[3],zmm12[3],zmm13[5],zmm12[5],zmm13[7],zmm12[7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm30, %zmm28, %zmm27 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm28[1],zmm11[1],zmm28[3],zmm11[3],zmm28[5],zmm11[5],zmm28[7],zmm11[7] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm8, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm15 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15] -; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm11 -; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm11, %ymm11 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm16[1],ymm21[1],ymm16[3],ymm21[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm29, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm29, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm26, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm1 +; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm1, %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7] +; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm6 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm2 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4240,1031 +4238,1043 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride8_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512F-NEXT: subq $3208, %rsp # imm = 0xC88 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm19 ; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm31 ; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm23 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm29 -; AVX512F-NEXT: vmovdqa 1152(%rdi), %ymm13 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2] -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm6 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa 1152(%rdi), %ymm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm9 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm8 -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm7 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm17 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm27 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm28 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm29 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm10 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %ymm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm31 -; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm26 +; AVX512F-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm30 +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3] -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm9 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm11, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512F-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm7 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm13 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm24, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7] -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] +; AVX512F-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm31, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm24, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm16 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm31, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm24, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] +; AVX512F-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7] -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm25 +; AVX512F-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm29 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm28 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7] -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} +; AVX512F-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] +; AVX512F-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm5 +; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 +; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm6 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm13 +; AVX512F-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %xmm18 +; AVX512F-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %xmm21 +; AVX512F-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm25 +; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm15, %zmm6, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm5 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm15, %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm21 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm8 -; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm8, %ymm8 -; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm18, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 1216(%rdi), %ymm12, %ymm12 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %xmm16 -; AVX512F-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm22, %zmm13, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %xmm25 -; AVX512F-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm26 -; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm31, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm1 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm13 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm3, %zmm14 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 128(%r9) +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 128(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512F-NEXT: vmovaps %zmm10, 64(%rax) -; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-NEXT: vmovaps %zmm7, 64(%rax) +; AVX512F-NEXT: addq $3208, %rsp # imm = 0xC88 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512BW-NEXT: subq $3208, %rsp # imm = 0xC88 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm31 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm23 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm29 -; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2] -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm6 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm22 +; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm9 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm8 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm7 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm17 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm29 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm10 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %ymm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm31 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm30 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm8 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm9 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm24, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7] -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm24, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm16 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7] -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7] -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm9, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm5 +; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm6 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm13 +; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm18 +; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm21 +; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm25 +; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm5 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm21 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm4, %ymm4 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm5, %ymm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm8 -; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm8, %ymm8 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm18, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm12, %ymm12 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm13, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm25 -; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm26 -; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm31, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm1 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm13 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm3, %zmm14 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 128(%r9) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 128(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512BW-NEXT: vmovaps %zmm10, 64(%rax) -; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-NEXT: vmovaps %zmm7, 64(%rax) +; AVX512BW-NEXT: addq $3208, %rsp # imm = 0xC88 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <256 x i64>, ptr %in.vec, align 64 @@ -9123,42 +9133,41 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride8_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $6600, %rsp # imm = 0x19C8 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512F-NEXT: subq $6664, %rsp # imm = 0x1A08 +; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9167,62 +9176,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX512F-NEXT: vmovdqa 3136(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] +; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2] +; AVX512F-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] +; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm30 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2] +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm21 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2] +; AVX512F-NEXT: vmovdqa 1600(%rdi), %ymm4 +; AVX512F-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9233,61 +9240,64 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 1216(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 1216(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 1152(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %ymm22 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm4 +; AVX512F-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 2688(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %ymm31 -; AVX512F-NEXT: vmovdqa 2560(%rdi), %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2] +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %ymm27 +; AVX512F-NEXT: vmovdqa64 2688(%rdi), %ymm26 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %ymm30 +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %ymm18 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %ymm28 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2] -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %ymm27 -; AVX512F-NEXT: vmovdqa 2048(%rdi), %ymm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-NEXT: vmovdqa 2240(%rdi), %ymm12 +; AVX512F-NEXT: vmovdqa 2176(%rdi), %ymm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX512F-NEXT: vmovdqa 2112(%rdi), %ymm10 +; AVX512F-NEXT: vmovdqa 2048(%rdi), %ymm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 4032(%rdi), %zmm1 @@ -9295,807 +9305,801 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 3968(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 3840(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3840(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa 3776(%rdi), %ymm12 -; AVX512F-NEXT: vmovdqa 3712(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] -; AVX512F-NEXT: vmovdqa 3648(%rdi), %ymm4 -; AVX512F-NEXT: vmovdqa 3584(%rdi), %ymm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3776(%rdi), %ymm17 +; AVX512F-NEXT: vmovdqa64 3712(%rdi), %ymm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] +; AVX512F-NEXT: vmovdqa 3648(%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa 3584(%rdi), %ymm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm5, %zmm19, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm10 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6] -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3776(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 3648(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3584(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 3776(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3712(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 3712(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 3648(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 3584(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 {%k1} = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload -; AVX512F-NEXT: # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm11 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm8 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7] +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm14, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm19 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] ; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm30 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7] -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm14, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm30 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm11 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm12 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512F-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm15, %ymm15 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm24 -; AVX512F-NEXT: vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %xmm24 -; AVX512F-NEXT: vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm13 +; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %xmm28 +; AVX512F-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX512F-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm9, %ymm27 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm9 -; AVX512F-NEXT: vinserti128 $1, 1152(%rdi), %ymm9, %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX512F-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm17 -; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm19, %zmm29, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %xmm19 -; AVX512F-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %xmm26 -; AVX512F-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm29, %zmm20, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm4 +; AVX512F-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm7 +; AVX512F-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm5 +; AVX512F-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 +; AVX512F-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX512F-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa 2112(%rdi), %xmm6 +; AVX512F-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %xmm29 +; AVX512F-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 2752(%rdi), %ymm12, %ymm12 -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %xmm29 -; AVX512F-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm21, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512F-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %xmm23 +; AVX512F-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa 3136(%rdi), %xmm6 -; AVX512F-NEXT: vinserti128 $1, 3264(%rdi), %ymm6, %ymm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %xmm17 +; AVX512F-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 ; AVX512F-NEXT: vmovdqa64 3072(%rdi), %xmm20 ; AVX512F-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm21, %zmm16, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa 3648(%rdi), %xmm7 -; AVX512F-NEXT: vinserti128 $1, 3776(%rdi), %ymm7, %ymm7 -; AVX512F-NEXT: vmovdqa64 3584(%rdi), %xmm21 -; AVX512F-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm22 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm20[1],ymm6[1],ymm20[3],ymm6[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa 3648(%rdi), %xmm9 +; AVX512F-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 +; AVX512F-NEXT: vmovdqa 3584(%rdi), %xmm14 +; AVX512F-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm2, 384(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm15, 384(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm3, 256(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm24, 128(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 256(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm16, 384(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10181,61 +10185,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, 448(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: addq $6600, %rsp # imm = 0x19C8 +; AVX512F-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6600, %rsp # imm = 0x19C8 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512BW-NEXT: subq $6664, %rsp # imm = 0x1A08 +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10244,62 +10247,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX512BW-NEXT: vmovdqa 3136(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] +; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm26 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2] +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm22 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm30 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2] +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm21 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2] +; AVX512BW-NEXT: vmovdqa 1600(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10310,61 +10311,64 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 2688(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %ymm31 -; AVX512BW-NEXT: vmovdqa 2560(%rdi), %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2] +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %ymm27 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %ymm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %ymm30 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %ymm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %ymm28 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2] -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %ymm27 -; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa 2240(%rdi), %ymm12 +; AVX512BW-NEXT: vmovdqa 2176(%rdi), %ymm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX512BW-NEXT: vmovdqa 2112(%rdi), %ymm10 +; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 4032(%rdi), %zmm1 @@ -10372,807 +10376,801 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 3968(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 3776(%rdi), %ymm12 -; AVX512BW-NEXT: vmovdqa 3712(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %ymm17 +; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %ymm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] +; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm10 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 3648(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 3648(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 {%k1} = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7] +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm19 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm30 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7] -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm14, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm30 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm11 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm15, %ymm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm24 -; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm24 -; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX512BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm9, %ymm27 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm9 -; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm9, %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm13 +; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm28 +; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm5 +; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX512BW-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa 2112(%rdi), %xmm6 +; AVX512BW-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm29 +; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm17 -; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm29, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %xmm19 -; AVX512BW-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm26 -; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm20, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm12, %ymm12 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm29 -; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm21, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm23 +; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm6 -; AVX512BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm6, %ymm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %xmm17 +; AVX512BW-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 ; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm20 ; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm16, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm7 -; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm7, %ymm7 -; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %xmm21 -; AVX512BW-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm22 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm20[1],ymm6[1],ymm20[3],ymm6[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm9 +; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 +; AVX512BW-NEXT: vmovdqa 3584(%rdi), %xmm14 +; AVX512BW-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11258,20 +11256,20 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: addq $6600, %rsp # imm = 0x19C8 +; AVX512BW-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <512 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index 6ef479b875419..ab51f0bf9135c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -599,174 +599,170 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-LABEL: store_i64_stride6_vf8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512F-NEXT: movb $12, %r10b ; AVX512F-NEXT: kmovw %r10d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512F-NEXT: movb $16, %r10b ; AVX512F-NEXT: kmovw %r10d, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 ; AVX512F-NEXT: movb $48, %r9b ; AVX512F-NEXT: kmovw %r9d, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,1,9,u,4,5,6,7> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,13,u,4,5,6,7> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride6_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512BW-NEXT: movb $12, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: movb $16, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,1,9,u,4,5,6,7> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,13,u,4,5,6,7> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 @@ -1445,20 +1441,20 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-LABEL: store_i64_stride6_vf16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm6 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] ; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 @@ -1469,136 +1465,135 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: movb $16, %r10b ; AVX512F-NEXT: kmovw %r10d, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512F-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm8, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm14 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm22, %zmm17 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 ; AVX512F-NEXT: movb $48, %r9b ; AVX512F-NEXT: kmovw %r9d, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512F-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512F-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,13,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm25, %zmm26 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,9,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 +; AVX512F-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 +; AVX512F-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} +; AVX512F-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] +; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm15 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512F-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512F-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512F-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] +; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 512(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride6_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 @@ -1609,116 +1604,115 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $16, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm22, %zmm17 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,13,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm26 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,9,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm15 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512BW-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -3151,559 +3145,557 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i64_stride6_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm29 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] -; AVX512F-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm29 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm20 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm7 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm13 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm12 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm27, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 -; AVX512F-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 -; AVX512F-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 -; AVX512F-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] -; AVX512F-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm16, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm16, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 +; AVX512F-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] +; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] +; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 ; AVX512F-NEXT: movb $12, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512F-NEXT: movb $48, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm18 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 -; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512F-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm26 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,9,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm17 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm30 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm13 +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm14 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,13,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 +; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 +; AVX512F-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 +; AVX512F-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 ; AVX512F-NEXT: movb $16, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> -; AVX512F-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> -; AVX512F-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] -; AVX512F-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} +; AVX512F-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm21, %zmm24 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 1408(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 1280(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 1024(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm29, 1408(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1280(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 1088(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 1024(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm28, 960(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm31, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512F-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-NEXT: vmovdqa64 %zmm13, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm22, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm31, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm23, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 1152(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride6_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm29 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] -; AVX512BW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 -; AVX512BW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] -; AVX512BW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm16, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 ; AVX512BW-NEXT: movb $12, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512BW-NEXT: movb $48, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 -; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm26 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,9,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm13 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,13,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 +; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 +; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 +; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 ; AVX512BW-NEXT: movb $16, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm24 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 960(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512BW-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -6680,1355 +6672,1393 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i64_stride6_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm9 -; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm8 -; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm7 -; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm30 -; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm22 -; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm21 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512F-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm30 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm28 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm27 +; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm24 +; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm15, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm15, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm15, %zmm29 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm17 +; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm13 +; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm16 +; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] +; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm11 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512F-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] -; AVX512F-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $12, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512F-NEXT: movb $48, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm29 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,9,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm4 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm6 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm9 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm10 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,13,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm11 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm13 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm14 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm16 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm12 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 +; AVX512F-NEXT: movb $16, %al +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 +; AVX512F-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: movb $16, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> -; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm4 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%r9), %zmm11 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512F-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512F-NEXT: vmovdqa64 320(%r9), %zmm8 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%r9), %zmm7 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%r9), %zmm22 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 -; AVX512F-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%r9), %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%r9), %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%r9), %zmm8 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%r9), %zmm9 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 ; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa 256(%rdx), %xmm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa 320(%rdx), %xmm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 384(%rdx), %xmm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa 448(%rdx), %xmm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512F-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 -; AVX512F-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 -; AVX512F-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 -; AVX512F-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa 320(%rdx), %xmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa 384(%rdx), %xmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa 448(%rdx), %xmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 +; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 +; AVX512F-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 +; AVX512F-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512F-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 +; AVX512F-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 +; AVX512F-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 +; AVX512F-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 2880(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 2816(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm6, 2752(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 2624(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm6, 2560(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 2496(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 2432(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 2368(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 2176(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, 2048(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 1984(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 3008(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm28, 2944(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 2880(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 2816(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 2624(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 2560(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 2496(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 2240(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 2176(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 2112(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, 2048(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 1728(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 1664(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1600(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 1280(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm29, 1344(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 896(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm31, 896(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 2304(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 1920(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 1536(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512F-NEXT: vmovdqa64 %zmm14, 2688(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 2304(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1920(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 1536(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 1152(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm23, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512F-NEXT: addq $3720, %rsp # imm = 0xE88 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm8 -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm30 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm22 -; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm21 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] -; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512BW-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm15, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm29 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] -; AVX512BW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $12, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} ; AVX512BW-NEXT: movb $48, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm29 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,9,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,13,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 +; AVX512BW-NEXT: movb $16, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: movb $16, %al -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm11 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm9 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 ; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa 320(%rdx), %xmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 448(%rdx), %xmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 -; AVX512BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 -; AVX512BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 -; AVX512BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 -; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa 320(%rdx), %xmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa 448(%rdx), %xmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 +; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 +; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 +; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 +; AVX512BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 +; AVX512BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 +; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 2944(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 2880(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 2816(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 2752(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 2624(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 2560(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 2496(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 2432(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 2368(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 2176(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 2048(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 1984(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 3008(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 2944(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 2880(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 2816(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2624(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2560(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 2496(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2240(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 2176(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 2112(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 2048(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1600(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 1280(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 1344(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 896(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 896(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 2304(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1920(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 1536(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512BW-NEXT: vmovdqa64 %zmm14, 2688(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 2304(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1920(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512BW-NEXT: addq $3720, %rsp # imm = 0xE88 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index 0de7beea9398a..6fb5e43a2df68 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -750,123 +750,122 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] ; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: movb $48, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm5[0],zmm6[0],zmm5[2],zmm6[2],zmm5[4],zmm6[4],zmm5[6],zmm6[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm8 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm5, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: movb $48, %cl -; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; @@ -875,122 +874,121 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm9 ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512F-ONLY-FAST-NEXT: movb $96, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm9, %zmm9 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 ; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm10 ; AVX512F-ONLY-FAST-NEXT: movb $48, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm9, %ymm12, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm10 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3] ; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm4, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm5, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm13 ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movb $120, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -998,122 +996,120 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm8, %zmm12 ; AVX512DQ-SLOW-NEXT: movb $-61, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512DQ-SLOW-NEXT: movb $96, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] ; AVX512DQ-SLOW-NEXT: movb $28, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm12 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [15,7,15,7,15,7,15,7] ; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm13, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] ; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 +; AVX512DQ-SLOW-NEXT: movb $48, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 ; AVX512DQ-SLOW-NEXT: movb $56, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-SLOW-NEXT: movb $12, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 ; AVX512DQ-SLOW-NEXT: movb $112, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm11 {%k2} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%r10), %zmm4, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] ; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: movb $120, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: movb $48, %cl -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -1122,121 +1118,120 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm8, %zmm9 ; AVX512DQ-FAST-NEXT: movb $-61, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 ; AVX512DQ-FAST-NEXT: movb $48, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,7,u> -; AVX512DQ-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,7,u> +; AVX512DQ-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 ; AVX512DQ-FAST-NEXT: movb $96, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512DQ-FAST-NEXT: movb $28, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm2[2,3,2,3] ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm12 ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k2} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm13 ; AVX512DQ-FAST-NEXT: movb $56, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-FAST-NEXT: movb $12, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 ; AVX512DQ-FAST-NEXT: movb $112, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm12 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 ; AVX512DQ-FAST-NEXT: movb $120, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -1244,123 +1239,122 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] ; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: movb $48, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm5[0],zmm6[0],zmm5[2],zmm6[2],zmm5[4],zmm6[4],zmm5[6],zmm6[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm8 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm5, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: movb $48, %cl -; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; @@ -1369,122 +1363,121 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: movb $96, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm9, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm10 ; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm9, %ymm12, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm10 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm4, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm5, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: movb $120, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; @@ -1492,122 +1485,120 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm8, %zmm12 ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512DQBW-SLOW-NEXT: movb $96, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm12 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [15,7,15,7,15,7,15,7] ; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm13, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 +; AVX512DQBW-SLOW-NEXT: movb $48, %sil +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm11 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 ; AVX512DQBW-SLOW-NEXT: movb $56, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQBW-SLOW-NEXT: movb $12, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 ; AVX512DQBW-SLOW-NEXT: movb $112, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm11 {%k2} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%r10), %zmm4, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: movb $120, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: movb $48, %cl -; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; @@ -1616,121 +1607,120 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm8, %zmm9 ; AVX512DQBW-FAST-NEXT: movb $-61, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 ; AVX512DQBW-FAST-NEXT: movb $48, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] ; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,7,u> -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,7,u> +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] ; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 ; AVX512DQBW-FAST-NEXT: movb $96, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512DQBW-FAST-NEXT: movb $28, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm2[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm11 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm12 ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k2} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm13 ; AVX512DQBW-FAST-NEXT: movb $56, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQBW-FAST-NEXT: movb $12, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 ; AVX512DQBW-FAST-NEXT: movb $112, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm12 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: movb $120, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 @@ -2509,841 +2499,823 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm18, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: movb $48, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm18[0],zmm9[0],zmm18[2],zmm9[2],zmm18[4],zmm9[4],zmm18[6],zmm9[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: movb $96, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm8, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm22, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm18, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm21, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm19, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm21, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm28, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] -; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm21, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: movb $48, %sil +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm12 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm30, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm23[0],ymm31[0],ymm23[2],ymm31[2] +; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k3} = zmm30[2,3,2,3],zmm28[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k3} = zmm29[2,3,2,3],zmm28[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm28[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm28 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm0, %zmm4 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm27, %zmm27 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm4 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm27 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm0, %zmm5 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm26, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm5 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm14 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm16[0,1,2,3],zmm24[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm31[1],ymm23[3],ymm31[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 576(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 832(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf16: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: movb $12, %sil +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm4[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: movb $64, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 -; AVX512F-ONLY-FAST-NEXT: movb $112, %sil +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: movb $24, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm26, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 +; AVX512F-ONLY-FAST-NEXT: movb $48, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm13, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm27, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm31, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm17, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm15, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k2} = zmm18[0],zmm2[0],zmm18[2],zmm2[2],zmm18[4],zmm2[4],zmm18[6],zmm2[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm31, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm23[0],ymm0[2],ymm23[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm23, %ymm15, %ymm0 ; AVX512F-ONLY-FAST-NEXT: movb $96, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm13 -; AVX512F-ONLY-FAST-NEXT: movb $120, %sil -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm22 -; AVX512F-ONLY-FAST-NEXT: movb $24, %dil -; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512F-ONLY-FAST-NEXT: movb $-31, %dil -; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm23, %zmm18 +; AVX512F-ONLY-FAST-NEXT: movb $28, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k3} = zmm5[2,3,2,3],zmm12[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] ; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: movb $-61, %dil -; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm12, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm31, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm23, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm23, %ymm27, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm27[0],ymm23[0],ymm27[2],ymm23[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k3} = zmm23[2,3,2,3],zmm12[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm21 -; AVX512F-ONLY-FAST-NEXT: movb $48, %sil -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm5 +; AVX512F-ONLY-FAST-NEXT: movb $112, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 {%k2} +; AVX512F-ONLY-FAST-NEXT: movb $120, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm18 {%k3} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm24, %ymm8, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: movb $28, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: movb $6, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: movb $64, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: movb $56, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: movb $6, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k5} +; AVX512F-ONLY-FAST-NEXT: movb $56, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 {%k6} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: movb $-31, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm28[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm4, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 {%k3} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm20 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k6} ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm14 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $64, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] ; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm21 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: movb $96, %sil +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm15, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm18, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: movb $48, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm17, %zmm18 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-SLOW-NEXT: movb $64, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm17 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm22 ; AVX512DQ-SLOW-NEXT: movb $24, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm22, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm18, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm21, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm19, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm21, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] ; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: movb $96, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm13, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] ; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm29, %zmm23 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %ymm23 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm15 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm20, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm20, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm28 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm28[0],ymm23[0],ymm28[2],ymm23[2] ; AVX512DQ-SLOW-NEXT: movb $28, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] -; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm29 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm1, %zmm24 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm25 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k3} = zmm30[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] ; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm30, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 -; AVX512DQ-SLOW-NEXT: movb $48, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm28 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] +; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm27, %zmm31 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k3} = zmm29[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm30, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %xmm27 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 ; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm0, %zmm4 {%k5} ; AVX512DQ-SLOW-NEXT: movb $112, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k7 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm26, %zmm4 {%k7} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} ; AVX512DQ-SLOW-NEXT: movb $120, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k4} ; AVX512DQ-SLOW-NEXT: movb $56, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k6} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k5} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm16, %zmm5 {%k7} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: movb $-31, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm17[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $-61, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k6} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm28[1],ymm23[1],ymm28[3],ymm23[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 832(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf16: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm21 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm18, %zmm5 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $64, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm17 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm20, %zmm16 +; AVX512DQ-FAST-NEXT: movb $24, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm16, %zmm31 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm22, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm27 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512DQ-FAST-NEXT: movb $12, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm12 +; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm8 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm28 ; AVX512DQ-FAST-NEXT: movb $112, %sil +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm16 +; AVX512DQ-FAST-NEXT: movb $48, %dil +; AVX512DQ-FAST-NEXT: kmovw %edi, %k2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3} ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm28, %zmm8 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm24 ; AVX512DQ-FAST-NEXT: movb $96, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm21 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3} +; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm22 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm22 ; AVX512DQ-FAST-NEXT: movb $120, %sil -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm29 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 -; AVX512DQ-FAST-NEXT: movb $24, %dil -; AVX512DQ-FAST-NEXT: kmovw %edi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm20, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm7, %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm9, %zmm30 ; AVX512DQ-FAST-NEXT: movb $-31, %dil -; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-FAST-NEXT: kmovw %edi, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm20 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm18 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm12, %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm20, %zmm30 ; AVX512DQ-FAST-NEXT: movb $-61, %dil -; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 -; AVX512DQ-FAST-NEXT: movb $48, %sil -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm28 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 +; AVX512DQ-FAST-NEXT: kmovw %edi, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm20 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 {%k3} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm7, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %ymm24 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512DQ-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm11 +; AVX512DQ-FAST-NEXT: vpermt2q %ymm24, %ymm4, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQ-FAST-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm14, %zmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k2} = zmm10[0],zmm2[0],zmm10[2],zmm2[2],zmm10[4],zmm2[4],zmm10[6],zmm2[6] +; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQ-FAST-NEXT: vpermi2q %ymm20, %ymm11, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm28[0],ymm24[0],ymm28[2],ymm24[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k3} ; AVX512DQ-FAST-NEXT: movb $28, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm10[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm11[0],ymm20[0],ymm11[2],ymm20[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm4[2,3,2,3],zmm9[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: movb $64, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm27 +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: movb $56, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm31, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm4 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -3351,841 +3323,823 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm18, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: movb $48, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm18[0],zmm9[0],zmm18[2],zmm9[2],zmm18[4],zmm9[4],zmm18[6],zmm9[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: movb $96, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm8, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm22, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm18, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm21, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm19, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm21, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm3, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm28, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] -; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm21, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: movb $48, %sil +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm12 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm30, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm23[0],ymm31[0],ymm23[2],ymm31[2] +; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k3} = zmm30[2,3,2,3],zmm28[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k3} = zmm29[2,3,2,3],zmm28[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm28[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm28 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm0, %zmm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm27, %zmm27 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm0, %zmm5 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm26, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm5 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm14 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm16[0,1,2,3],zmm24[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm31[1],ymm23[3],ymm31[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 576(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 832(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf16: ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm16 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm4[0,1,2,3],zmm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $64, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm8, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm26, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm13, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm27, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm31, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm17, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm15, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k2} = zmm18[0],zmm2[0],zmm18[2],zmm2[2],zmm18[4],zmm2[4],zmm18[6],zmm2[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm31, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm23[0],ymm0[2],ymm23[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm23, %ymm15, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: movb $96, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: movb $24, %dil -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512BW-ONLY-FAST-NEXT: movb $-31, %dil -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm23, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: movb $28, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k3} = zmm5[2,3,2,3],zmm12[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $-61, %dil -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm12, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm31, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm27 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm23, %ymm27, %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm27[0],ymm23[0],ymm27[2],ymm23[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k3} = zmm23[2,3,2,3],zmm12[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 {%k2} +; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm18 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm24, %ymm8, %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: movb $28, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $6, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: movb $64, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: movb $56, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k5} +; AVX512BW-ONLY-FAST-NEXT: movb $56, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: movb $-31, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm28[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm4, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm20 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k6} ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm14 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: movb $64, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm21 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: movb $96, %sil +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm12, %zmm18, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: movb $48, %sil +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm17, %zmm18 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: movb $64, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm17 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm22 ; AVX512DQBW-SLOW-NEXT: movb $24, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm22, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm18, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm21, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm19, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm21, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] ; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: movb $96, %sil +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm13, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] ; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm29, %zmm23 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %ymm23 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm20, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm20, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm28 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm28[0],ymm23[0],ymm28[2],ymm23[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm1, %zmm24 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm25 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k3} = zmm30[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm30, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 -; AVX512DQBW-SLOW-NEXT: movb $48, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm27, %zmm31 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm7, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k3} = zmm29[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm30, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm27 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm0, %zmm4 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $112, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k7 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm26, %zmm4 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} ; AVX512DQBW-SLOW-NEXT: movb $120, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $56, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k6} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k5} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm16, %zmm5 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $-31, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm17[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k6} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm28[1],ymm23[1],ymm28[3],ymm23[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 832(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf16: ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm21 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm18, %zmm5 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $64, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [15,7,15,7,15,7,15,7] ; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm17 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm20, %zmm16 +; AVX512DQBW-FAST-NEXT: movb $24, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm16, %zmm31 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm22, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm27 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512DQBW-FAST-NEXT: movb $12, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm12 +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm8 {%k3} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm28 ; AVX512DQBW-FAST-NEXT: movb $112, %sil +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm16 +; AVX512DQBW-FAST-NEXT: movb $48, %dil +; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3} ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm28, %zmm8 {%k3} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm24 ; AVX512DQBW-FAST-NEXT: movb $96, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm21 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3} +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 {%k3} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm22 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm22 ; AVX512DQBW-FAST-NEXT: movb $120, %sil -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 -; AVX512DQBW-FAST-NEXT: movb $24, %dil -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm20, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm7, %zmm20 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm9, %zmm30 ; AVX512DQBW-FAST-NEXT: movb $-31, %dil -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQBW-FAST-NEXT: kmovd %edi, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm18 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm12, %zmm20 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm20, %zmm30 ; AVX512DQBW-FAST-NEXT: movb $-61, %dil -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 -; AVX512DQBW-FAST-NEXT: movb $48, %sil -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 +; AVX512DQBW-FAST-NEXT: kmovd %edi, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 {%k3} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm7, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %ymm24 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm28, %ymm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm24, %ymm4, %ymm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm14, %zmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k2} = zmm10[0],zmm2[0],zmm10[2],zmm2[2],zmm10[4],zmm2[4],zmm10[6],zmm2[6] +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm20, %ymm11, %ymm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm28[0],ymm24[0],ymm28[2],ymm24[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k3} ; AVX512DQBW-FAST-NEXT: movb $28, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm10[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm11[0],ymm20[0],ymm11[2],ymm20[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm4[2,3,2,3],zmm9[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: movb $64, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm27 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k1} ; AVX512DQBW-FAST-NEXT: movb $56, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm31, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm4 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq @@ -5754,3539 +5708,3517 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512F-ONLY-SLOW-NEXT: subq $2184, %rsp # imm = 0x888 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512F-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm16, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm22, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm16[0],ymm0[2],ymm16[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[2,3,2,3],zmm29[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm21, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[2],ymm13[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm30[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm20, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm30 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm24, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: movb $48, %r10b -; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm11[0],zmm13[0],zmm11[2],zmm13[2],zmm11[4],zmm13[4],zmm11[6],zmm13[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm22[0],zmm0[0],zmm22[2],zmm0[2],zmm22[4],zmm0[4],zmm22[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm19, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: movb $48, %r10b +; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm7[0],zmm2[0],zmm7[2],zmm2[2],zmm7[4],zmm2[4],zmm7[6],zmm2[6] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm14[0],zmm8[0],zmm14[2],zmm8[2],zmm14[4],zmm8[4],zmm14[6],zmm8[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm18, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm24, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm10, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm27 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm22 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm11 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm10, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm10, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm23 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm22 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm22 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} -; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k3} +; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k3} +; AVX512F-ONLY-SLOW-NEXT: movb $56, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm26 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm21 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm21 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm11 {%k4} +; AVX512F-ONLY-SLOW-NEXT: movb $64, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm21, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm7[2,3,2,3],zmm6[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: movb $64, %cl -; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm29 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $8, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm13, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm13, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm12, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm25, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm31, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1344(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $2184, %rsp # imm = 0x888 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512F-ONLY-FAST-NEXT: subq $2152, %rsp # imm = 0x868 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512F-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm3, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm5, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm3, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm3, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm5, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm5, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm25, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm24 ; AVX512F-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm12[0],zmm3[0],zmm12[2],zmm3[2],zmm12[4],zmm3[4],zmm12[6],zmm3[6] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm11, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm21, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm21, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm24, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm21, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm15, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm27, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: movb $24, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm9, %zmm24 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm9, %zmm5 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: movb $120, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} -; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k5} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k4} -; AVX512F-ONLY-FAST-NEXT: movb $24, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k4} +; AVX512F-ONLY-FAST-NEXT: movb $120, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k5} -; AVX512F-ONLY-FAST-NEXT: movb $-31, %sil +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k4} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm5 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k4} +; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm29[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm28[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512F-ONLY-FAST-NEXT: movb $56, %cl +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} +; AVX512F-ONLY-FAST-NEXT: movb $-31, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} +; AVX512F-ONLY-FAST-NEXT: movb $56, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} +; AVX512F-ONLY-FAST-NEXT: movb $64, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm8 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm12, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-ONLY-FAST-NEXT: movb $64, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movb $8, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm6, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm24, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 960(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 896(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1664(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1536(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512DQ-SLOW-NEXT: subq $2184, %rsp # imm = 0x888 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm4 ; AVX512DQ-SLOW-NEXT: movb $96, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm11 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX512DQ-SLOW-NEXT: movb $28, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm19 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm16, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm29[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm25, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[2,3,2,3],zmm15[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm17, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm22 ; AVX512DQ-SLOW-NEXT: movb $48, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k3 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm20 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm28 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm17, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm16[0],zmm0[0],zmm16[2],zmm0[2],zmm16[4],zmm0[4],zmm16[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm18 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm8[0],zmm13[0],zmm8[2],zmm13[2],zmm8[4],zmm13[4],zmm8[6],zmm13[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm2, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: movb $120, %sil +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm11[0],zmm19[0],zmm11[2],zmm19[2],zmm11[4],zmm19[4],zmm11[6],zmm19[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm12 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <12,u,u,3,4,5,6,13> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm3, %zmm22 +; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm18 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm4, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: movb $120, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm11 {%k4} +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm3 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k4} ; AVX512DQ-SLOW-NEXT: movb $-61, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: movb $24, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm4 = zmm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm3 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm30[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} -; AVX512DQ-SLOW-NEXT: movb $-31, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm24 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k4} -; AVX512DQ-SLOW-NEXT: movb $112, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm14 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm5, %zmm10 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm5, %zmm24 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm2 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} +; AVX512DQ-SLOW-NEXT: movb $-31, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} +; AVX512DQ-SLOW-NEXT: movb $112, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm2, %zmm27 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512DQ-SLOW-NEXT: movb $56, %cl +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm23 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm2, %zmm18 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm13 {%k3} +; AVX512DQ-SLOW-NEXT: movb $56, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k4} +; AVX512DQ-SLOW-NEXT: movb $64, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm7 = zmm27[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,11,u,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm16, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,11,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm22, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k2} = zmm16[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,11,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r8), %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512DQ-SLOW-NEXT: movb $64, %cl -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k1} ; AVX512DQ-SLOW-NEXT: movb $8, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm13, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm19, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm13, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,12,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm22, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm24, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm21, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm8, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 1088(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 640(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1536(%rax) -; AVX512DQ-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1536(%rax) +; AVX512DQ-SLOW-NEXT: addq $2184, %rsp # imm = 0x888 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $2056, %rsp # imm = 0x808 +; AVX512DQ-FAST-NEXT: subq $2088, %rsp # imm = 0x828 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm5 ; AVX512DQ-FAST-NEXT: movb $96, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%r8), %ymm15 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 ; AVX512DQ-FAST-NEXT: movb $28, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm5, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm5, %ymm4, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm0, %ymm5, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm4, %ymm5, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm30 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm25, %zmm4 +; AVX512DQ-FAST-NEXT: movb $48, %r10b +; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm18[0],zmm7[0],zmm18[2],zmm7[2],zmm18[4],zmm7[4],zmm18[6],zmm7[6] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] ; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: movb $48, %r10b -; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm29 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm31 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm24 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm8, %zmm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm15 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm13, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm27 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm11 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: movb $14, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: movb $120, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3} -; AVX512DQ-FAST-NEXT: movb $-61, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm18, %zmm16 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm18, %zmm7, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm18 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm19, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm8[0],zmm26[0],zmm8[2],zmm26[2],zmm8[4],zmm26[4],zmm8[6],zmm26[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm19, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm9[0],zmm24[0],zmm9[2],zmm24[2],zmm9[4],zmm24[4],zmm9[6],zmm24[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm29, %zmm23, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm19, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm27, %zmm13 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm19, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm10 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,11,u,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,11,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm17, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm8 ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} -; AVX512DQ-FAST-NEXT: movb $-31, %sil +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm21, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: movb $14, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm25 {%k4} +; AVX512DQ-FAST-NEXT: movb $120, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k4} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm12[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} +; AVX512DQ-FAST-NEXT: movb $-61, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k4} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm5[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm12 {%k4} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 {%k3} +; AVX512DQ-FAST-NEXT: movb $-31, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm27 {%k3} +; AVX512DQ-FAST-NEXT: movb $112, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm3, %zmm12 {%k3} ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} -; AVX512DQ-FAST-NEXT: movb $112, %sil +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4} +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm14 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: movb $6, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} -; AVX512DQ-FAST-NEXT: movb $56, %cl +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm19 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm2, %zmm16 {%k3} +; AVX512DQ-FAST-NEXT: movb $56, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k4} +; AVX512DQ-FAST-NEXT: movb $64, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-FAST-NEXT: movb $64, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: movb $8, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm20, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm10, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: kmovw %eax, %k4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 1344(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 1088(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 768(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 640(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) -; AVX512DQ-FAST-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 1536(%rax) +; AVX512DQ-FAST-NEXT: addq $2088, %rsp # imm = 0x828 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf32: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512BW-ONLY-SLOW-NEXT: subq $2184, %rsp # imm = 0x888 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm16, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm16[0],ymm0[2],ymm16[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[2,3,2,3],zmm29[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm20, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm22, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm21, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[2],ymm13[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm1, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm24, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm19, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm24 ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm11[0],zmm13[0],zmm11[2],zmm13[2],zmm11[4],zmm13[4],zmm11[6],zmm13[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm22[0],zmm0[0],zmm22[2],zmm0[2],zmm22[4],zmm0[4],zmm22[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm7[0],zmm2[0],zmm7[2],zmm2[2],zmm7[4],zmm2[4],zmm7[6],zmm2[6] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm14[0],zmm8[0],zmm14[2],zmm8[2],zmm14[4],zmm8[4],zmm14[6],zmm8[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm18, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm24, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm27 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm22 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm10, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm10, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm23 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm22 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: movb $56, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm26 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm21 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm21 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm11 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: movb $64, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm10 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm21, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm7[2,3,2,3],zmm6[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: movb $64, %cl -; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm29 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $8, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm13, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm13, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm25, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm31, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1344(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $2184, %rsp # imm = 0x888 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512BW-ONLY-FAST-NEXT: subq $2152, %rsp # imm = 0x868 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm30 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm3, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm5, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm3, %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm3, %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm5, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm5, %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm25, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b +; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm12[0],zmm3[0],zmm12[2],zmm3[2],zmm12[4],zmm3[4],zmm12[6],zmm3[6] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm11, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b -; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm21, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm21, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm24, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm21, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm27, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm9, %zmm24 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm9, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm3, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm3, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} -; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k4} -; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k4} +; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k5} -; AVX512BW-ONLY-FAST-NEXT: movb $-31, %sil +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k4} +; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm29[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm28[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} +; AVX512BW-ONLY-FAST-NEXT: movb $-31, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} +; AVX512BW-ONLY-FAST-NEXT: movb $56, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} +; AVX512BW-ONLY-FAST-NEXT: movb $64, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: movb $64, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: movb $8, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm6, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm19, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm24, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 960(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 896(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1536(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf32: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512DQBW-SLOW-NEXT: subq $2184, %rsp # imm = 0x888 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm4 ; AVX512DQBW-SLOW-NEXT: movb $96, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm19 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm19 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm16, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm29[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm25, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[2,3,2,3],zmm15[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm17, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm22 ; AVX512DQBW-SLOW-NEXT: movb $48, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm28 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm17, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm16[0],zmm0[0],zmm16[2],zmm0[2],zmm16[4],zmm0[4],zmm16[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm18 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm8[0],zmm13[0],zmm8[2],zmm13[2],zmm8[4],zmm13[4],zmm8[6],zmm13[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm2, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: movb $120, %sil +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm11[0],zmm19[0],zmm11[2],zmm19[2],zmm11[4],zmm19[4],zmm11[6],zmm19[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm12 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm3, %zmm22 +; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm18 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm4, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: movb $120, %sil +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm11 {%k4} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm3 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: movb $24, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm30[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm3 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} -; AVX512DQBW-SLOW-NEXT: movb $-31, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm24 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k4} -; AVX512DQBW-SLOW-NEXT: movb $112, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm14 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm5, %zmm10 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm5, %zmm24 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} +; AVX512DQBW-SLOW-NEXT: movb $-31, %sil +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} +; AVX512DQBW-SLOW-NEXT: movb $112, %sil +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm2, %zmm27 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512DQBW-SLOW-NEXT: movb $56, %cl +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm23 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm2, %zmm18 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm13 {%k3} +; AVX512DQBW-SLOW-NEXT: movb $56, %sil +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k4} +; AVX512DQBW-SLOW-NEXT: movb $64, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm27[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm16, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm22, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k2} = zmm16[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r8), %ymm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX512DQBW-SLOW-NEXT: movb $64, %cl -; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $8, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm13, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm19, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm13, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm22, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm24, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm21, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 1088(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 640(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1536(%rax) -; AVX512DQBW-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: addq $2184, %rsp # imm = 0x888 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf32: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $2056, %rsp # imm = 0x808 +; AVX512DQBW-FAST-NEXT: subq $2088, %rsp # imm = 0x828 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm5 ; AVX512DQBW-FAST-NEXT: movb $96, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm16 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r8), %ymm15 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 ; AVX512DQBW-FAST-NEXT: movb $28, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm7 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm5, %ymm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm20 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm5, %ymm4, %ymm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm0, %ymm5, %ymm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm4, %ymm5, %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm30 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm25, %zmm4 +; AVX512DQBW-FAST-NEXT: movb $48, %r10b +; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm18[0],zmm7[0],zmm18[2],zmm7[2],zmm18[4],zmm7[4],zmm18[6],zmm7[6] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: movb $48, %r10b -; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm31 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm24 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm8, %zmm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm13, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm27 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: movb $14, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: movb $120, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3} -; AVX512DQBW-FAST-NEXT: movb $-61, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm18, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm18, %zmm7, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm18 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm19, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm8[0],zmm26[0],zmm8[2],zmm26[2],zmm8[4],zmm26[4],zmm8[6],zmm26[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm19, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm9[0],zmm24[0],zmm9[2],zmm24[2],zmm9[4],zmm24[4],zmm9[6],zmm24[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm29, %zmm23, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm19, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm27, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm19, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm10 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm17, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm8 ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} -; AVX512DQBW-FAST-NEXT: movb $-31, %sil +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm21, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: movb $14, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm25 {%k4} +; AVX512DQBW-FAST-NEXT: movb $120, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k4} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm12[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} +; AVX512DQBW-FAST-NEXT: movb $-61, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k4} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm5[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm12 {%k4} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 {%k3} +; AVX512DQBW-FAST-NEXT: movb $-31, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm27 {%k3} +; AVX512DQBW-FAST-NEXT: movb $112, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm3, %zmm12 {%k3} ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} -; AVX512DQBW-FAST-NEXT: movb $112, %sil +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4} +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm14 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: movb $6, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} -; AVX512DQBW-FAST-NEXT: movb $56, %cl +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm19 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm2, %zmm16 {%k3} +; AVX512DQBW-FAST-NEXT: movb $56, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k4} +; AVX512DQBW-FAST-NEXT: movb $64, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQBW-FAST-NEXT: movb $64, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512DQBW-FAST-NEXT: movb $8, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm20, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm10, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 1344(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 1088(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 768(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 640(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) -; AVX512DQBW-FAST-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 1536(%rax) +; AVX512DQBW-FAST-NEXT: addq $2088, %rsp # imm = 0x828 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -12615,766 +12547,760 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $6600, %rsp # imm = 0x19C8 +; AVX512F-ONLY-SLOW-NEXT: subq $6248, %rsp # imm = 0x1868 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm24, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm24, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm21[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm24, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm31, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm11, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm24, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm30[0],ymm4[2],ymm30[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm31, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%r9), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm5[2,3,2,3],zmm1[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm25, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm25, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: movb $24, %r10b +; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm7, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm5, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: movb $48, %r10b -; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm25, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm8, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm25, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k4} = zmm27[0],zmm0[0],zmm27[2],zmm0[2],zmm27[4],zmm0[4],zmm27[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm21[0],zmm22[0],zmm21[2],zmm22[2],zmm21[4],zmm22[4],zmm21[6],zmm22[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k4} = zmm15[0],zmm22[0],zmm15[2],zmm22[2],zmm15[4],zmm22[4],zmm15[6],zmm22[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm8, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm25, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: movb $4, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,10,u,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm25, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm28, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm28, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm28, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm28[0],zmm19[0],zmm28[2],zmm19[2],zmm28[4],zmm19[4],zmm28[6],zmm19[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm17 ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k5} -; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,9,u,u,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k5} +; AVX512F-ONLY-SLOW-NEXT: movb $4, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,4,8,u,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,10,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: movb $8, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k3} ; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm19, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm20 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} @@ -13385,64 +13311,65 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -13450,22 +13377,29 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -13480,88 +13414,81 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2944(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2944(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2880(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2624(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2432(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2496(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 2432(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2240(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1984(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2304(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2112(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1984(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 1920(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1792(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1792(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1664(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1472(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3520(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3520(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13572,820 +13499,816 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3072(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $6600, %rsp # imm = 0x19C8 +; AVX512F-ONLY-SLOW-NEXT: addq $6248, %rsp # imm = 0x1868 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $6696, %rsp # imm = 0x1A28 +; AVX512F-ONLY-FAST-NEXT: subq $6120, %rsp # imm = 0x17E8 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: movb $96, %r10b -; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: movb $96, %r10b +; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,12,7,0,1,12,7] ; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm26[0],ymm11[0],ymm26[2],ymm11[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm17[0],ymm12[2],ymm17[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm27, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm18, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm14[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm9[0],ymm22[2],ymm9[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%r8), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm16[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm10[2,3,2,3],zmm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm23, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm26 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm17, %ymm0, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm10, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm0, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm31, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 +; AVX512F-ONLY-FAST-NEXT: movb $24, %r10b +; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k3} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: movb $48, %r10b -; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm20, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] ; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm0[0],zmm23[0],zmm0[2],zmm23[2],zmm0[4],zmm23[4],zmm0[6],zmm23[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: movb $48, %r10b +; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm30, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k4} = zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm23, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm3[0],zmm28[0],zmm3[2],zmm28[2],zmm3[4],zmm28[4],zmm3[6],zmm28[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm20[0],zmm0[2],zmm20[2],zmm0[4],zmm20[4],zmm0[6],zmm20[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm23, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm16[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm20, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm15 -; AVX512F-ONLY-FAST-NEXT: movb $4, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,10,u,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm14, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k4} = zmm27[0],zmm25[0],zmm27[2],zmm25[2],zmm27[4],zmm25[4],zmm27[6],zmm25[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm23, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm20, %zmm26 -; AVX512F-ONLY-FAST-NEXT: movb $24, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm31, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm14, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k4} = zmm14[0],zmm18[0],zmm14[2],zmm18[2],zmm14[4],zmm18[4],zmm14[6],zmm18[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm15 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 ; AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k5} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movb $64, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: movb $12, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,4,8,u,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,9,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} +; AVX512F-ONLY-FAST-NEXT: movb $4, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: movb $12, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $8, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm31 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k3} ; AVX512F-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm10 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm8 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm9 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm3 ; AVX512F-ONLY-FAST-NEXT: movb $112, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512F-ONLY-FAST-NEXT: movb $56, %cl -; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512F-ONLY-FAST-NEXT: movb $56, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm6 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -14393,1089 +14316,1093 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $120, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $-61, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm29[0,1,2,3],zmm30[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2944(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2880(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2816(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2752(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2688(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2624(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2560(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 3008(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2944(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 2880(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 2816(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2688(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 2624(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 2560(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2432(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2368(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 2368(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 2304(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2112(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1984(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm10, 1920(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1856(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1984(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1664(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1536(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm9, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 1408(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 576(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 3520(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3328(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3072(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $6696, %rsp # imm = 0x1A28 +; AVX512F-ONLY-FAST-NEXT: addq $6120, %rsp # imm = 0x17E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $6472, %rsp # imm = 0x1948 +; AVX512DQ-SLOW-NEXT: subq $6280, %rsp # imm = 0x1888 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: movb $96, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX512DQ-SLOW-NEXT: movb $28, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,13,14,7,6,13,14,7] -; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm24, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7] +; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm31, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r9), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm27, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r9), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm24, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r9), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm11, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r9), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r8), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r9), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm25, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm31, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r8), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm7[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm25, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512DQ-SLOW-NEXT: movb $24, %r10b +; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm6, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm25, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm5, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm10 +; AVX512DQ-SLOW-NEXT: movb $48, %r10b +; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k4 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm16 -; AVX512DQ-SLOW-NEXT: movb $48, %r10b -; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k3 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k4} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm29 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm22[0],zmm23[0],zmm22[2],zmm23[2],zmm22[4],zmm23[4],zmm22[6],zmm23[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm28[0],zmm21[0],zmm28[2],zmm21[2],zmm28[4],zmm21[4],zmm28[6],zmm21[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k4} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm25, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,11,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm24 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k4} = zmm16[0],zmm3[0],zmm16[2],zmm3[2],zmm16[4],zmm3[4],zmm16[6],zmm3[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm16, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm16, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm16, %zmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm28 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: movb $6, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 +; AVX512DQ-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,9,u,u,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: movb $64, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k5} ; AVX512DQ-SLOW-NEXT: movb $4, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,u,u,3,4,5,6,13> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: movb $12, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r9), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r8), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: movb $24, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k5} -; AVX512DQ-SLOW-NEXT: movb $6, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k5} -; AVX512DQ-SLOW-NEXT: movb $64, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm17 -; AVX512DQ-SLOW-NEXT: movb $12, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r9), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r8), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-SLOW-NEXT: movb $8, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k5} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} ; AVX512DQ-SLOW-NEXT: movb $-31, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} ; AVX512DQ-SLOW-NEXT: movb $112, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm19, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm2 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm2 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm10, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm3 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm4 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm27 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm21 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm0, %zmm28 {%k2} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm5 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm30 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm25 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm3, %zmm21 {%k2} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k3} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} ; AVX512DQ-SLOW-NEXT: movb $56, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm20 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: movb $120, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: movb $-61, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm11 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm5 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm7 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm7 = zmm9[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 3008(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 2944(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 3008(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 2944(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 2880(%rax) +; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 2560(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 2752(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 2560(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2496(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 2432(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 2176(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 2304(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 2240(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 2112(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1984(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1792(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 1920(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1856(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1792(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 1728(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1472(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 1024(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 3520(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 3520(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15486,815 +15413,809 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3072(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQ-SLOW-NEXT: addq $6472, %rsp # imm = 0x1948 +; AVX512DQ-SLOW-NEXT: addq $6280, %rsp # imm = 0x1888 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 +; AVX512DQ-FAST-NEXT: subq $6120, %rsp # imm = 0x17E8 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: movb $96, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm6 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX512DQ-FAST-NEXT: movb $28, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm9[0],ymm30[2],ymm9[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm24 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 128(%r8), %ymm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm8, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm25[0],ymm11[2],ymm25[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm9[0],ymm24[0],ymm9[2],ymm24[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm20, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%r9), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%r8), %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm23[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%r9), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%r8), %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm10[2,3,2,3],zmm21[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%r9), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%r8), %ymm11 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm20, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%r9), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%r8), %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm8, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm25, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vpermt2q %ymm24, %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %ymm15, %ymm0, %ymm11 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 384(%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%r8), %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 384(%r9), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%r8), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: movb $48, %r10b +; AVX512DQ-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm30 +; AVX512DQ-FAST-NEXT: movb $24, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm18, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm4[0],zmm2[0],zmm4[2],zmm2[2],zmm4[4],zmm2[4],zmm4[6],zmm2[6] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: movb $48, %r10b +; AVX512DQ-FAST-NEXT: kmovw %r10d, %k4 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm31[0],zmm0[2],zmm31[2],zmm0[4],zmm31[4],zmm0[6],zmm31[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm28 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm18, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm26[0],zmm12[2],zmm26[2],zmm12[4],zmm26[4],zmm12[6],zmm26[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm24 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm21 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm6 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,11,u,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm14 -; AVX512DQ-FAST-NEXT: movb $4, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,10,u,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,12,u,3,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm10, %zmm18 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm0[0],zmm28[0],zmm0[2],zmm28[2],zmm0[4],zmm28[4],zmm0[6],zmm28[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm23, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm27 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm23 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm27 -; AVX512DQ-FAST-NEXT: movb $24, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm25 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k4} = zmm31[0],zmm21[0],zmm31[2],zmm21[2],zmm31[4],zmm21[4],zmm31[6],zmm21[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm23, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm19, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm20 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm14, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm14[0],zmm9[0],zmm14[2],zmm9[2],zmm14[4],zmm9[4],zmm14[6],zmm9[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm16 ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k5 -; AVX512DQ-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k5} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm12 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: movb $64, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} +; AVX512DQ-FAST-NEXT: movb $4, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,11,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 ; AVX512DQ-FAST-NEXT: movb $12, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k3} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm11, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5} -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm17[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,9,u,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 ; AVX512DQ-FAST-NEXT: movb $8, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm21 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} ; AVX512DQ-FAST-NEXT: movb $-31, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm22 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k4} ; AVX512DQ-FAST-NEXT: movb $112, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm13 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm5 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm23, %zmm7 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm7 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm27, %zmm11 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm10, %zmm25 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm29 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm18, %zmm23 {%k2} ; AVX512DQ-FAST-NEXT: movb $56, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm20 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm15 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm28 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm26 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -16302,920 +16223,913 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: movb $120, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} ; AVX512DQ-FAST-NEXT: movb $-61, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm7 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm8 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm9 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm11 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm18 = zmm18[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 2944(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 2880(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2816(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 2752(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 2688(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2624(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 2560(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 3008(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 2944(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2880(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 2816(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 2752(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 2688(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 2624(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 2560(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 2432(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2368(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 2432(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 2368(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 2304(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2240(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 2112(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 2112(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1984(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1920(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1856(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 1984(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 1920(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1856(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 1536(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 1472(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 1024(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 3520(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3520(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 3328(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 3072(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQ-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 +; AVX512DQ-FAST-NEXT: addq $6120, %rsp # imm = 0x17E8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $6600, %rsp # imm = 0x19C8 +; AVX512BW-ONLY-SLOW-NEXT: subq $6248, %rsp # imm = 0x1868 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm24, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm24, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm21[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm24, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm31, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm11, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm24, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm30[0],ymm4[2],ymm30[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm31, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%r9), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm5[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm25, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm25, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: movb $24, %r10b +; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm7, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm5, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r10b -; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm25, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm25, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k4} = zmm27[0],zmm0[0],zmm27[2],zmm0[2],zmm27[4],zmm0[4],zmm27[6],zmm0[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm27 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm8, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm21[0],zmm22[0],zmm21[2],zmm22[2],zmm21[4],zmm22[4],zmm21[6],zmm22[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k4} = zmm15[0],zmm22[0],zmm15[2],zmm22[2],zmm15[4],zmm22[4],zmm15[6],zmm22[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm8, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm25, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: movb $4, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,10,u,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm25, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm28, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm28, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm28, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm28[0],zmm19[0],zmm28[2],zmm19[2],zmm28[4],zmm19[4],zmm28[6],zmm19[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,9,u,u,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: movb $4, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,4,8,u,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,10,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm8, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movb $8, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm19, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm20 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} @@ -17226,64 +17140,65 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -17291,22 +17206,29 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm10 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -17321,88 +17243,81 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2944(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2944(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2880(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2624(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2432(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2496(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 2432(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2240(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1984(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2304(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2112(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1984(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 1920(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1792(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1792(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1664(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1472(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3520(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3520(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17413,820 +17328,816 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3072(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $6600, %rsp # imm = 0x19C8 +; AVX512BW-ONLY-SLOW-NEXT: addq $6248, %rsp # imm = 0x1868 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $6696, %rsp # imm = 0x1A28 +; AVX512BW-ONLY-FAST-NEXT: subq $6120, %rsp # imm = 0x17E8 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm26[0],ymm11[0],ymm26[2],ymm11[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm17[0],ymm12[2],ymm17[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm27, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm18, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r9), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm14[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm9[0],ymm22[2],ymm9[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%r8), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm16[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm10[2,3,2,3],zmm1[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm31 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm23, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm26 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm25 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm17, %ymm0, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm10, %ymm0, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm0, %ymm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm31, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm0, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: movb $24, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm20, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] ; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm0[0],zmm23[0],zmm0[2],zmm23[2],zmm0[4],zmm23[4],zmm0[6],zmm23[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b +; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm30, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k4} = zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm23, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm3[0],zmm28[0],zmm3[2],zmm28[2],zmm3[4],zmm28[4],zmm3[6],zmm28[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm27, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm20[0],zmm0[2],zmm20[2],zmm0[4],zmm20[4],zmm0[6],zmm20[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm23, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm20, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: movb $4, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,10,u,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm14, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k4} = zmm27[0],zmm25[0],zmm27[2],zmm25[2],zmm27[4],zmm25[4],zmm27[6],zmm25[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm4, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm20, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm31, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm10, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm14, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k4} = zmm14[0],zmm18[0],zmm14[2],zmm18[2],zmm14[4],zmm18[4],zmm14[6],zmm18[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm14, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm15 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: movb $64, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,4,8,u,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,9,u,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: movb $4, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $8, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm31 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k3} ; AVX512BW-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm10 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm8 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm9 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl -; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm6 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -18234,1089 +18145,1093 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $120, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm13 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm29[0,1,2,3],zmm30[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2944(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2880(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2816(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2752(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2688(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2624(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2560(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 3008(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2944(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 2880(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 2816(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2752(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2688(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 2624(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 2560(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2432(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2368(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2240(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 2368(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 2304(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2112(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1984(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm10, 1920(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1856(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1984(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 1920(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1856(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1664(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1536(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm9, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 1408(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 576(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 3520(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3328(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3072(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $6696, %rsp # imm = 0x1A28 +; AVX512BW-ONLY-FAST-NEXT: addq $6120, %rsp # imm = 0x17E8 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $6472, %rsp # imm = 0x1948 +; AVX512DQBW-SLOW-NEXT: subq $6280, %rsp # imm = 0x1888 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: movb $96, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm24, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm31, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r9), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm27, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r9), %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm24, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r9), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm11, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r9), %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r8), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r9), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm25, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm31, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r9), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r8), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm7[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm25, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512DQBW-SLOW-NEXT: movb $24, %r10b +; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm6, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm25, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm31, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm5, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm10 +; AVX512DQBW-SLOW-NEXT: movb $48, %r10b +; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm16 -; AVX512DQBW-SLOW-NEXT: movb $48, %r10b -; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k4} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm29 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm22[0],zmm23[0],zmm22[2],zmm23[2],zmm22[4],zmm23[4],zmm22[6],zmm23[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm8, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm28[0],zmm21[0],zmm28[2],zmm21[2],zmm28[4],zmm21[4],zmm28[6],zmm21[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k4} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm25, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k4} = zmm16[0],zmm3[0],zmm16[2],zmm3[2],zmm16[4],zmm3[4],zmm16[6],zmm3[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm16, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm16, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm16, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: movb $6, %sil +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,9,u,u,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: movb $64, %sil +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $4, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: movb $12, %sil +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r9), %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r8), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: movb $24, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k5} -; AVX512DQBW-SLOW-NEXT: movb $6, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k5} -; AVX512DQBW-SLOW-NEXT: movb $64, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: movb $12, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %xmm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r9), %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r8), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 ; AVX512DQBW-SLOW-NEXT: movb $8, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} ; AVX512DQBW-SLOW-NEXT: movb $-31, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $112, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm19, %zmm7 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm10, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm4 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm27 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm21 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm0, %zmm28 {%k2} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm5 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm30 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm25 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm3, %zmm21 {%k2} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k3} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $56, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm20 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $120, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $-61, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm11 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm5 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm9[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 3008(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 2944(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 3008(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 2944(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 2880(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 2560(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 2560(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2496(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 2432(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 2176(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 2304(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 2240(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 2112(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1984(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1792(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 1920(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1792(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 1728(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1472(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 1024(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 3520(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 3520(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19327,815 +19242,809 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3072(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQBW-SLOW-NEXT: addq $6472, %rsp # imm = 0x1948 +; AVX512DQBW-SLOW-NEXT: addq $6280, %rsp # imm = 0x1888 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 +; AVX512DQBW-FAST-NEXT: subq $6120, %rsp # imm = 0x17E8 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: movb $96, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm22 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm9 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r8), %ymm6 ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX512DQBW-FAST-NEXT: movb $28, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 ; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm9[0],ymm30[2],ymm9[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm24 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r8), %ymm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm8, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %ymm25 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm25[0],ymm11[2],ymm25[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm23[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %ymm24 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm9[0],ymm24[0],ymm9[2],ymm24[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm20, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r9), %ymm9 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r8), %ymm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm23[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r9), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r8), %ymm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm10[2,3,2,3],zmm21[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r9), %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r8), %ymm11 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm20, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r9), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r8), %ymm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm8, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm25, %ymm0, %ymm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm24, %ymm0, %ymm25 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm15, %ymm0, %ymm11 ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm5 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r9), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r8), %ymm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r9), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r8), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: movb $48, %r10b +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm30 +; AVX512DQBW-FAST-NEXT: movb $24, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm4[0],zmm2[0],zmm4[2],zmm2[2],zmm4[4],zmm2[4],zmm4[6],zmm2[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm31[0],zmm0[2],zmm31[2],zmm0[4],zmm31[4],zmm0[6],zmm31[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm18, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm18, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm28 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: movb $48, %r10b +; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k4 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm18, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm26[0],zmm12[2],zmm26[2],zmm12[4],zmm26[4],zmm12[6],zmm26[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm6 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm14 -; AVX512DQBW-FAST-NEXT: movb $4, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,10,u,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm10, %zmm18 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm0[0],zmm28[0],zmm0[2],zmm28[2],zmm0[4],zmm28[4],zmm0[6],zmm28[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm23, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm27 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm23 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm27 -; AVX512DQBW-FAST-NEXT: movb $24, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm25 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k4} = zmm31[0],zmm21[0],zmm31[2],zmm21[2],zmm31[4],zmm21[4],zmm31[6],zmm21[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm23, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm19, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm19, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm14, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm14[0],zmm9[0],zmm14[2],zmm9[2],zmm14[4],zmm9[4],zmm14[6],zmm9[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm16 ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k5 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm12 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: movb $64, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} +; AVX512DQBW-FAST-NEXT: movb $4, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 ; AVX512DQBW-FAST-NEXT: movb $12, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %xmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k3} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm17[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %xmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,9,u,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 ; AVX512DQBW-FAST-NEXT: movb $8, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm21 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} ; AVX512DQBW-FAST-NEXT: movb $-31, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %xmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm22 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %xmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k4} ; AVX512DQBW-FAST-NEXT: movb $112, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm13 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm5 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm23, %zmm7 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm7 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm27, %zmm11 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm10, %zmm25 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm29 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm18, %zmm23 {%k2} ; AVX512DQBW-FAST-NEXT: movb $56, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm20 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm15 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm28 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm26 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -20143,155 +20052,154 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQBW-FAST-NEXT: movb $120, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} ; AVX512DQBW-FAST-NEXT: movb $-61, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm7 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm8 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm9 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm11 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm11 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm18 = zmm18[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 2944(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 2880(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2816(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 2752(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 2688(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2624(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 2560(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 3008(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 2944(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2880(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 2816(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 2752(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 2688(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 2624(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 2560(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 2432(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2368(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 2432(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 2368(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 2304(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2240(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 2112(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 2112(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1984(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1920(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1856(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 1984(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 1920(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1856(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 1536(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 1472(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 1024(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 3520(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3520(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3328(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 3328(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 3072(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQBW-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 +; AVX512DQBW-FAST-NEXT: addq $6120, %rsp # imm = 0x17E8 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll index 8bad8e79ae361..87b5732cc1aa3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -784,9 +784,9 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm8 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm11 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512F-NEXT: vmovdqa64 (%r11), %zmm1 @@ -804,53 +804,47 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vmovdqa (%rdi), %xmm5 ; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm17 +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm5 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm11 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [7,15,7,15] -; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] -; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 @@ -861,13 +855,13 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 ; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512F-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] @@ -878,7 +872,7 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 @@ -887,9 +881,9 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512F-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -899,9 +893,9 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm11 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm1 @@ -919,53 +913,47 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 ; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm17 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm11 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [7,15,7,15] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 @@ -976,13 +964,13 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] @@ -993,7 +981,7 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 @@ -1002,9 +990,9 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 @@ -1894,204 +1882,196 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm27 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm28 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm26 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm31 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512F-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%r11), %zmm30 +; AVX512F-NEXT: vmovdqa64 64(%r11), %zmm3 ; AVX512F-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm29 +; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm18, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm18, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 ; AVX512F-NEXT: movb $-64, %r8b ; AVX512F-NEXT: kmovw %r8d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 +; AVX512F-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512F-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm23, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm21 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm1 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm31[1],zmm6[3],zmm31[3],zmm6[5],zmm31[5],zmm6[7],zmm31[7] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm22 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm31[0],zmm6[2],zmm31[2],zmm6[4],zmm31[4],zmm6[6],zmm31[6] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm13 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm24 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm13, %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm13, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512F-NEXT: vpermt2q %zmm19, %zmm28, %zmm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm18, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm16, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm23, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm16, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm16[1],zmm27[1],zmm16[3],zmm27[3],zmm16[5],zmm27[5],zmm16[7],zmm27[7] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vpermi2q %zmm25, %zmm11, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm27[0],zmm16[2],zmm27[2],zmm16[4],zmm27[4],zmm16[6],zmm27[6] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vpermi2q %zmm25, %zmm11, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] -; AVX512F-NEXT: vpermi2q %zmm25, %zmm11, %zmm13 -; AVX512F-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm28, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] -; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm7, %zmm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm11 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm12 -; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512F-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %ymm23 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm25[1],ymm15[1],ymm25[3],ymm15[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm10, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %ymm23 +; AVX512F-NEXT: vmovdqa64 (%rdx), %ymm24 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %ymm25 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] +; AVX512F-NEXT: vmovdqa64 (%rsi), %ymm26 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %ymm27 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm28 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm30 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm26[1],ymm23[1],ymm26[3],ymm23[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm10, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm26[0],ymm23[0],ymm26[2],ymm23[2] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 704(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 896(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm31, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm29, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2101,204 +2081,196 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm28 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm31 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r11), %zmm30 +; AVX512BW-NEXT: vmovdqa64 64(%r11), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm29 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm18, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 ; AVX512BW-NEXT: movb $-64, %r8b ; AVX512BW-NEXT: kmovd %r8d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm23, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm21 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm31[1],zmm6[3],zmm31[3],zmm6[5],zmm31[5],zmm6[7],zmm31[7] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm22 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm31[0],zmm6[2],zmm31[2],zmm6[4],zmm31[4],zmm6[6],zmm31[6] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm13 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm24 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm13, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm16, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm23, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm16, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm16[1],zmm27[1],zmm16[3],zmm27[3],zmm16[5],zmm27[5],zmm16[7],zmm27[7] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm11, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm27[0],zmm16[2],zmm27[2],zmm16[4],zmm27[4],zmm16[6],zmm27[6] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm11, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm11, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm11 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm12 -; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm23 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm25[1],ymm15[1],ymm25[3],ymm15[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %ymm23 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm24 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm25 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] +; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm26 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm27 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm28 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm30 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm26[1],ymm23[1],ymm26[3],ymm23[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm26[0],ymm23[0],ymm26[2],ymm23[2] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -4142,461 +4114,451 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i64_stride8_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: subq $2312, %rsp # imm = 0x908 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm25 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512F-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm21 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm16 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm24 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-NEXT: vmovaps 192(%rdx), %zmm1 +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovaps 128(%rdx), %zmm1 +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm23 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm18 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512F-NEXT: vmovdqa64 (%r10), %zmm25 +; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm17 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm20 ; AVX512F-NEXT: movb $-64, %r11b ; AVX512F-NEXT: kmovw %r11d, %k1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm15, %zmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm30 -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm21 -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm14 -; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 ; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 ; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6] -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm29, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6] -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm19 +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm20 +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 +; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm8 +; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm28, %zmm13, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rsi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 128(%rdx), %ymm12, %ymm13 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm28 +; AVX512F-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm26 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm5, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rsi), %xmm7 -; AVX512F-NEXT: vinserti128 $1, 192(%rcx), %ymm7, %ymm7 -; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 192(%rdx), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rsi), %xmm3 +; AVX512F-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512F-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm10 +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512F-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 +; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512F-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2] -; AVX512F-NEXT: vmovdqa64 (%rsi), %ymm20 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm3 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %ymm16 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %ymm20 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm13 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2] -; AVX512F-NEXT: vmovdqa64 128(%rsi), %ymm20 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX512F-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm14 +; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm15 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %ymm21 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm14 +; AVX512F-NEXT: vmovdqa 128(%rdx), %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512F-NEXT: vmovdqa64 128(%rsi), %ymm21 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm21 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm9 -; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm14 +; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512F-NEXT: vmovdqa64 192(%rsi), %ymm17 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512F-NEXT: vmovaps %zmm10, 1920(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 1792(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1536(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 1472(%rax) -; AVX512F-NEXT: vmovaps %zmm19, 1408(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1280(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 1920(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1856(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 1792(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm31, 1600(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm30, 1536(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 1088(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm28, 1024(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -4605,10 +4567,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm29, 512(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -4621,467 +4581,457 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: addq $2312, %rsp # imm = 0x908 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: subq $2312, %rsp # imm = 0x908 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512BW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm24 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovaps 128(%rdx), %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm18 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm17 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm20 ; AVX512BW-NEXT: movb $-64, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm30 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm21 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm14 -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6] -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6] -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm19 +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm20 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 +; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm8 +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm30 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm31 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm3, %ymm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm13, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm12, %ymm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm28 +; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm26 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm5, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm7 -; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm7, %ymm7 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm3 +; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm30, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm31 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm31 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2] -; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm3 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm16 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm13 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2] -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm20 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm14 +; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm15 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm21 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm14 +; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm21 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm21 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm9 -; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm14 +; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm17 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512BW-NEXT: vmovaps %zmm10, 1920(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1536(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1472(%rax) -; AVX512BW-NEXT: vmovaps %zmm19, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 1920(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1856(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1472(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 1024(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5090,10 +5040,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 512(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5106,7 +5054,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: addq $2312, %rsp # imm = 0x908 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -8878,1977 +8826,1895 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i64_stride8_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512F-NEXT: subq $5384, %rsp # imm = 0x1508 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm3 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm15 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm30 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm18 +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm11 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512F-NEXT: vmovdqa64 (%r10), %zmm26 +; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm31 +; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm27 +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512F-NEXT: movb $-64, %r11b ; AVX512F-NEXT: kmovw %r11d, %k1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512F-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512F-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512F-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm1 +; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm3 -; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 -; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%r10), %zmm13 +; AVX512F-NEXT: vmovdqa64 256(%rax), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 +; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm18 +; AVX512F-NEXT: vmovdqa64 256(%r9), %zmm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512F-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512F-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm1 +; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 320(%r10), %zmm16 +; AVX512F-NEXT: vmovdqa64 320(%rax), %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 +; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm29 +; AVX512F-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512F-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512F-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512F-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512F-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 +; AVX512F-NEXT: vmovdqa64 384(%r10), %zmm13 +; AVX512F-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 +; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm5 +; AVX512F-NEXT: vmovdqa64 384(%r9), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512F-NEXT: vmovdqa64 448(%rax), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 +; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512F-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512F-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 128(%rsi), %ymm4 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 192(%rsi), %ymm4 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512F-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512F-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 256(%rsi), %ymm3 -; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512F-NEXT: vmovdqa 256(%rsi), %ymm4 +; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512F-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512F-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 320(%rsi), %ymm3 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512F-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512F-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa 384(%rsi), %ymm4 +; AVX512F-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512F-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512F-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512F-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512F-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm1 ; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqa 128(%rsi), %xmm1 ; AVX512F-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512F-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512F-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqa 192(%rsi), %xmm1 ; AVX512F-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512F-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512F-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512F-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512F-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512F-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512F-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512F-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512F-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512F-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa64 256(%rsi), %xmm16 +; AVX512F-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %xmm17 +; AVX512F-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa64 320(%rsi), %xmm16 +; AVX512F-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %xmm17 +; AVX512F-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 384(%rsi), %xmm17 +; AVX512F-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %xmm20 +; AVX512F-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 448(%rsi), %xmm20 +; AVX512F-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %xmm26 +; AVX512F-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 3712(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 3264(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 3200(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm22, 2688(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm28, 2240(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 2176(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 1728(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 1664(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm23, 1152(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 640(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 192(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 128(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 4032(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 3968(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 3904(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 3840(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 3648(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 3584(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 3520(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 3456(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 3392(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 3328(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 3136(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm24, 3072(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 3008(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 2880(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 2816(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 2624(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 2560(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 2496(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 2432(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 2368(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 2304(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm30, 2112(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 1984(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 1920(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 1856(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 1792(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1536(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 1472(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 1408(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 1344(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 1280(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 1088(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 960(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 896(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 384(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 256(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-NEXT: addq $5384, %rsp # imm = 0x1508 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512BW-NEXT: subq $5384, %rsp # imm = 0x1508 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm15 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm30 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm18 +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm11 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm26 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm31 +; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm27 +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512BW-NEXT: movb $-64, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512BW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm8 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm8 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512BW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm19, %zmm6 +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm8 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm6 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%r10), %zmm13 +; AVX512BW-NEXT: vmovdqa64 256(%rax), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm7 +; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm18 +; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm7 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512BW-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 320(%r10), %zmm16 +; AVX512BW-NEXT: vmovdqa64 320(%rax), %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm12 +; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm29 +; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm12 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512BW-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512BW-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm29 +; AVX512BW-NEXT: vmovdqa64 384(%r10), %zmm13 +; AVX512BW-NEXT: vmovdqa64 384(%rax), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm15 +; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm5 +; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512BW-NEXT: vmovdqa64 448(%rax), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm21 +; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm25 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm31, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 128(%rsi), %ymm4 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 192(%rsi), %ymm4 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm19 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512BW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 256(%rsi), %ymm3 -; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512BW-NEXT: vmovdqa 256(%rsi), %ymm4 +; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 320(%rsi), %ymm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa 384(%rsi), %ymm4 +; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512BW-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512BW-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm10 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm1 ; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm3, %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm1 ; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm1 ; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512BW-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512BW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512BW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512BW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512BW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512BW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512BW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %xmm17 +; AVX512BW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm20, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %xmm17 +; AVX512BW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm31, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 384(%rsi), %xmm17 +; AVX512BW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %xmm20 +; AVX512BW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm17, %zmm26, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %xmm20 +; AVX512BW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %xmm26 +; AVX512BW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm31, %zmm11, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm12, %zmm12 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 3712(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 3264(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 3200(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 2688(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 2240(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 2176(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 1664(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 640(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 192(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 4032(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 3968(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 3904(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 3840(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 3648(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 3584(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 3520(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 3456(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 3392(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 3328(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 3136(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 3072(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 3008(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 2880(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 2816(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 2624(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 2560(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 2496(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 2432(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 2368(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 2304(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 2112(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 1984(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 1920(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 1856(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 1792(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1536(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 1472(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 1408(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 1344(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1088(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 960(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 896(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 384(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-NEXT: addq $5384, %rsp # imm = 0x1508 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64