diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index dd5c011bfe784..6284ded3be922 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6057,11 +6057,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) {
 
 SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
   SDLoc dl(N);
-  // Build a vector with undefined for the new nodes.
+  // Build a vector with poison for the new nodes.
   EVT VT = N->getValueType(0);
 
   // Integer BUILD_VECTOR operands may be larger than the node's vector element
-  // type. The UNDEFs need to have the same type as the existing operands.
+  // type. The POISONs need to have the same type as the existing operands.
   EVT EltVT = N->getOperand(0).getValueType();
   unsigned NumElts = VT.getVectorNumElements();
 
@@ -6070,7 +6070,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
 
   SmallVector<SDValue, 16> NewOps(N->ops());
   assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
-  NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT));
+  NewOps.append(WidenNumElts - NumElts, DAG.getPOISON(EltVT));
 
   return DAG.getBuildVector(WidenVT, dl, NewOps);
 }
diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll
index 7f07ef476b8aa..1db776ea6f616 100644
--- a/llvm/test/CodeGen/AArch64/fsh.ll
+++ b/llvm/test/CodeGen/AArch64/fsh.ll
@@ -3537,27 +3537,22 @@ define <7 x i32> @rotl_v7i32_c(<7 x i32> %a) {
 ; CHECK-SD: // %bb.0: // %entry
 ; CHECK-SD-NEXT: fmov s0, w0
 ; CHECK-SD-NEXT: fmov s1, w4
-; CHECK-SD-NEXT: adrp x8, .LCPI108_0
-; CHECK-SD-NEXT: adrp x9, .LCPI108_1
-; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI108_0]
-; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI108_1]
 ; CHECK-SD-NEXT: mov v0.s[1], w1
 ; CHECK-SD-NEXT: mov v1.s[1], w5
 ; CHECK-SD-NEXT: mov v0.s[2], w2
 ; CHECK-SD-NEXT: mov v1.s[2], w6
 ; CHECK-SD-NEXT: mov v0.s[3], w3
-; CHECK-SD-NEXT: ushl v2.4s, v1.4s, v2.4s
-; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-SD-NEXT: shl v4.4s, v0.4s, #3
-; CHECK-SD-NEXT: usra v4.4s, v0.4s, #29
-; CHECK-SD-NEXT: orr v0.16b, v1.16b, v2.16b
-; CHECK-SD-NEXT: mov w1, v4.s[1]
-; CHECK-SD-NEXT: mov w2, v4.s[2]
-; CHECK-SD-NEXT: mov w3, v4.s[3]
-; CHECK-SD-NEXT: mov w5, v0.s[1]
-; CHECK-SD-NEXT: mov w6, v0.s[2]
-; CHECK-SD-NEXT: fmov w0, s4
-; CHECK-SD-NEXT: fmov w4, s0
+; CHECK-SD-NEXT: shl v3.4s, v1.4s, #3
+; CHECK-SD-NEXT: usra v3.4s, v1.4s, #29
+; CHECK-SD-NEXT: shl v2.4s, v0.4s, #3
+; CHECK-SD-NEXT: mov w5, v3.s[1]
+; CHECK-SD-NEXT: mov w6, v3.s[2]
+; CHECK-SD-NEXT: fmov w4, s3
+; CHECK-SD-NEXT: usra v2.4s, v0.4s, #29
+; CHECK-SD-NEXT: mov w1, v2.s[1]
+; CHECK-SD-NEXT: mov w2, v2.s[2]
+; CHECK-SD-NEXT: mov w3, v2.s[3]
+; CHECK-SD-NEXT: fmov w0, s2
 ; CHECK-SD-NEXT: ret
 ;
 ; CHECK-GI-LABEL: rotl_v7i32_c:
@@ -3614,27 +3609,22 @@ define <7 x i32> @rotr_v7i32_c(<7 x i32> %a) {
 ; CHECK-SD: // %bb.0: // %entry
 ; CHECK-SD-NEXT: fmov s0, w0
 ; CHECK-SD-NEXT: fmov s1, w4
-; CHECK-SD-NEXT: adrp x8, .LCPI109_0
-; CHECK-SD-NEXT: adrp x9, .LCPI109_1
-; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI109_0]
-; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI109_1]
 ; CHECK-SD-NEXT: mov v0.s[1], w1
 ; CHECK-SD-NEXT: mov v1.s[1], w5
 ; CHECK-SD-NEXT: mov v0.s[2], w2
 ; CHECK-SD-NEXT: mov v1.s[2], w6
 ; CHECK-SD-NEXT: mov v0.s[3], w3
-; CHECK-SD-NEXT: ushl v2.4s, v1.4s, v2.4s
-; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-SD-NEXT: shl v4.4s, v0.4s, #29
-; CHECK-SD-NEXT: usra v4.4s, v0.4s, #3
-; CHECK-SD-NEXT: orr 
v0.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: mov w1, v4.s[1] -; CHECK-SD-NEXT: mov w2, v4.s[2] -; CHECK-SD-NEXT: mov w3, v4.s[3] -; CHECK-SD-NEXT: mov w5, v0.s[1] -; CHECK-SD-NEXT: mov w6, v0.s[2] -; CHECK-SD-NEXT: fmov w0, s4 -; CHECK-SD-NEXT: fmov w4, s0 +; CHECK-SD-NEXT: shl v3.4s, v1.4s, #29 +; CHECK-SD-NEXT: usra v3.4s, v1.4s, #3 +; CHECK-SD-NEXT: shl v2.4s, v0.4s, #29 +; CHECK-SD-NEXT: mov w5, v3.s[1] +; CHECK-SD-NEXT: mov w6, v3.s[2] +; CHECK-SD-NEXT: fmov w4, s3 +; CHECK-SD-NEXT: usra v2.4s, v0.4s, #3 +; CHECK-SD-NEXT: mov w1, v2.s[1] +; CHECK-SD-NEXT: mov w2, v2.s[2] +; CHECK-SD-NEXT: mov w3, v2.s[3] +; CHECK-SD-NEXT: fmov w0, s2 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: rotr_v7i32_c: @@ -4132,36 +4122,31 @@ define <7 x i32> @fshl_v7i32_c(<7 x i32> %a, <7 x i32> %b) { ; CHECK-SD-LABEL: fshl_v7i32_c: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov s0, w0 -; CHECK-SD-NEXT: fmov s2, w4 -; CHECK-SD-NEXT: ldr s1, [sp, #24] -; CHECK-SD-NEXT: fmov s3, w7 +; CHECK-SD-NEXT: fmov s1, w4 ; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: fmov s2, w7 +; CHECK-SD-NEXT: ldr s3, [sp, #24] ; CHECK-SD-NEXT: add x9, sp, #32 -; CHECK-SD-NEXT: ld1 { v1.s }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: adrp x10, .LCPI134_1 ; CHECK-SD-NEXT: mov v0.s[1], w1 -; CHECK-SD-NEXT: mov v2.s[1], w5 -; CHECK-SD-NEXT: ldr q5, [x10, :lo12:.LCPI134_1] -; CHECK-SD-NEXT: ld1 { v3.s }[1], [x8] +; CHECK-SD-NEXT: mov v1.s[1], w5 +; CHECK-SD-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-SD-NEXT: ld1 { v2.s }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #8 -; CHECK-SD-NEXT: ld1 { v1.s }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: add x9, sp, #40 +; CHECK-SD-NEXT: ld1 { v3.s }[2], [x9] ; CHECK-SD-NEXT: mov v0.s[2], w2 -; CHECK-SD-NEXT: mov v2.s[2], w6 -; CHECK-SD-NEXT: ld1 { v3.s }[2], [x8] -; CHECK-SD-NEXT: adrp x8, .LCPI134_0 -; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI134_0] -; CHECK-SD-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-SD-NEXT: mov v1.s[2], w6 +; CHECK-SD-NEXT: ld1 { v2.s }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v2.s }[3], [x8] ; CHECK-SD-NEXT: mov v0.s[3], w3 -; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v4.4s -; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v5.4s -; CHECK-SD-NEXT: orr v1.16b, v2.16b, v1.16b +; CHECK-SD-NEXT: shl v1.4s, v1.4s, #3 +; CHECK-SD-NEXT: usra v1.4s, v3.4s, #29 ; CHECK-SD-NEXT: shl v0.4s, v0.4s, #3 ; CHECK-SD-NEXT: mov w5, v1.s[1] ; CHECK-SD-NEXT: mov w6, v1.s[2] ; CHECK-SD-NEXT: fmov w4, s1 -; CHECK-SD-NEXT: usra v0.4s, v3.4s, #29 +; CHECK-SD-NEXT: usra v0.4s, v2.4s, #29 ; CHECK-SD-NEXT: mov w1, v0.s[1] ; CHECK-SD-NEXT: mov w2, v0.s[2] ; CHECK-SD-NEXT: mov w3, v0.s[3] @@ -4225,36 +4210,31 @@ define <7 x i32> @fshr_v7i32_c(<7 x i32> %a, <7 x i32> %b) { ; CHECK-SD-LABEL: fshr_v7i32_c: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov s0, w0 -; CHECK-SD-NEXT: fmov s2, w4 -; CHECK-SD-NEXT: ldr s1, [sp, #24] -; CHECK-SD-NEXT: fmov s3, w7 +; CHECK-SD-NEXT: fmov s1, w4 ; CHECK-SD-NEXT: mov x8, sp +; CHECK-SD-NEXT: fmov s2, w7 +; CHECK-SD-NEXT: ldr s3, [sp, #24] ; CHECK-SD-NEXT: add x9, sp, #32 -; CHECK-SD-NEXT: ld1 { v1.s }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: adrp x10, .LCPI135_1 ; CHECK-SD-NEXT: mov v0.s[1], w1 -; CHECK-SD-NEXT: mov v2.s[1], w5 -; CHECK-SD-NEXT: ldr q5, [x10, :lo12:.LCPI135_1] -; CHECK-SD-NEXT: ld1 { v3.s }[1], [x8] +; CHECK-SD-NEXT: mov v1.s[1], w5 +; CHECK-SD-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-SD-NEXT: ld1 { v2.s }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #8 -; CHECK-SD-NEXT: ld1 { v1.s }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 
+; CHECK-SD-NEXT: add x9, sp, #40 +; CHECK-SD-NEXT: ld1 { v3.s }[2], [x9] ; CHECK-SD-NEXT: mov v0.s[2], w2 -; CHECK-SD-NEXT: mov v2.s[2], w6 -; CHECK-SD-NEXT: ld1 { v3.s }[2], [x8] -; CHECK-SD-NEXT: adrp x8, .LCPI135_0 -; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI135_0] -; CHECK-SD-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-SD-NEXT: mov v1.s[2], w6 +; CHECK-SD-NEXT: ld1 { v2.s }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v2.s }[3], [x8] ; CHECK-SD-NEXT: mov v0.s[3], w3 -; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v4.4s -; CHECK-SD-NEXT: ushl v2.4s, v2.4s, v5.4s -; CHECK-SD-NEXT: orr v1.16b, v2.16b, v1.16b +; CHECK-SD-NEXT: shl v1.4s, v1.4s, #29 +; CHECK-SD-NEXT: usra v1.4s, v3.4s, #3 ; CHECK-SD-NEXT: shl v0.4s, v0.4s, #29 ; CHECK-SD-NEXT: mov w5, v1.s[1] ; CHECK-SD-NEXT: mov w6, v1.s[2] ; CHECK-SD-NEXT: fmov w4, s1 -; CHECK-SD-NEXT: usra v0.4s, v3.4s, #3 +; CHECK-SD-NEXT: usra v0.4s, v2.4s, #3 ; CHECK-SD-NEXT: mov w1, v0.s[1] ; CHECK-SD-NEXT: mov w2, v0.s[2] ; CHECK-SD-NEXT: mov w3, v0.s[3] diff --git a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll index b85cb3a4f191c..6fff0d9b155ef 100644 --- a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll @@ -450,7 +450,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; ARM7-NEXT: .short 9 @ 0x9 ; ARM7-NEXT: .short 10 @ 0xa ; ARM7-NEXT: .short 10 @ 0xa -; ARM7-NEXT: .short 10 @ 0xa +; ARM7-NEXT: .short 0 @ 0x0 ; ARM7-NEXT: .LCPI4_4: ; ARM7-NEXT: .short 341 @ 0x155 ; ARM7-NEXT: .short 292 @ 0x124 @@ -502,7 +502,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; ARM8-NEXT: .short 9 @ 0x9 ; ARM8-NEXT: .short 10 @ 0xa ; ARM8-NEXT: .short 10 @ 0xa -; ARM8-NEXT: .short 10 @ 0xa +; ARM8-NEXT: .short 0 @ 0x0 ; ARM8-NEXT: .LCPI4_4: ; ARM8-NEXT: .short 341 @ 0x155 ; ARM8-NEXT: .short 292 @ 0x124 @@ -554,7 +554,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; NEON7-NEXT: .short 9 @ 0x9 ; NEON7-NEXT: .short 10 @ 0xa ; NEON7-NEXT: .short 10 @ 0xa -; NEON7-NEXT: .short 10 @ 0xa +; NEON7-NEXT: .short 0 @ 0x0 ; NEON7-NEXT: .LCPI4_4: ; NEON7-NEXT: .short 341 @ 0x155 ; NEON7-NEXT: .short 292 @ 0x124 @@ -606,7 +606,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; NEON8-NEXT: .short 9 @ 0x9 ; NEON8-NEXT: .short 10 @ 0xa ; NEON8-NEXT: .short 10 @ 0xa -; NEON8-NEXT: .short 10 @ 0xa +; NEON8-NEXT: .short 0 @ 0x0 ; NEON8-NEXT: .LCPI4_4: ; NEON8-NEXT: .short 341 @ 0x155 ; NEON8-NEXT: .short 292 @ 0x124 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index a2fcd7962b8b0..5567310bb2a61 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -8,25 +8,15 @@ ; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3 define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) { -; RV32-LABEL: load_factor2_v3: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV32-NEXT: vle32.v v10, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wi v8, v10, 0 -; RV32-NEXT: vnsrl.wx v9, v10, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: load_factor2_v3: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV64-NEXT: vle32.v v10, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v9, v10, a0 -; RV64-NEXT: 
vnsrl.wi v8, v10, 0 -; RV64-NEXT: ret +; CHECK-LABEL: load_factor2_v3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v10, a0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: ret %interleaved.vec = load <6 x i32>, ptr %ptr %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index 636fdfae68438..ba9c926c57152 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -579,7 +579,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32MV-NEXT: vmv.v.x v10, a3 ; RV32MV-NEXT: srli a3, a1, 22 ; RV32MV-NEXT: or a2, a3, a2 -; RV32MV-NEXT: lui a3, 41121 +; RV32MV-NEXT: lui a3, 161 ; RV32MV-NEXT: slli a1, a1, 10 ; RV32MV-NEXT: srli a1, a1, 21 ; RV32MV-NEXT: vslide1down.vx v10, v10, a1 @@ -636,7 +636,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64MV-NEXT: lui a3, %hi(.LCPI4_0) ; RV64MV-NEXT: addi a3, a3, %lo(.LCPI4_0) ; RV64MV-NEXT: vle16.v v9, (a3) -; RV64MV-NEXT: lui a3, 41121 +; RV64MV-NEXT: lui a3, 161 ; RV64MV-NEXT: slli a2, a2, 32 ; RV64MV-NEXT: or a1, a1, a2 ; RV64MV-NEXT: andi a2, a1, 2047 diff --git a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll index a0247c29f257f..e5350409cd6ba 100644 --- a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll @@ -117,7 +117,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; CHECK-NEXT: .short 9 @ 0x9 ; CHECK-NEXT: .short 10 @ 0xa ; CHECK-NEXT: .short 10 @ 0xa -; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 0 @ 0x0 ; CHECK-NEXT: .LCPI4_4: ; CHECK-NEXT: .short 341 @ 0x155 ; CHECK-NEXT: .short 292 @ 0x124 diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 81529aff39ff1..19c84d42a7ea6 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1141,8 +1141,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: subq $56, %rsp +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] ; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] ; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax @@ -1171,8 +1171,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; ; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] ; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-ONLY-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] ; CHECK-ONLY-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 
; CHECK-ONLY-AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 4b0f75df83a76..ac4554176c3e7 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -679,39 +679,39 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] ; SSE2-NEXT: packuswb %xmm4, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm6 -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] -; SSE2-NEXT: packuswb %xmm5, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: movq %xmm4, (%rsi) +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; SSE2-NEXT: packuswb %xmm4, %xmm4 +; SSE2-NEXT: movq %xmm4, (%rdx) +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE2-NEXT: pand %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm0, %xmm4 ; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; 
SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm4, (%rsi) -; SSE2-NEXT: movq %xmm5, (%rdx) ; SSE2-NEXT: movq %xmm0, (%rcx) ; SSE2-NEXT: retq ; @@ -724,16 +724,16 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE42-NEXT: movdqa %xmm0, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; SSE42-NEXT: por %xmm2, %xmm3 +; SSE42-NEXT: movq %xmm3, (%rsi) ; SSE42-NEXT: movdqa %xmm1, %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u] -; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,4,7,10,13],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm2, %xmm4 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm2, %xmm3 +; SSE42-NEXT: movq %xmm3, (%rdx) ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; SSE42-NEXT: por %xmm1, %xmm0 -; SSE42-NEXT: movq %xmm3, (%rsi) -; SSE42-NEXT: movq %xmm4, (%rdx) ; SSE42-NEXT: movq %xmm0, (%rcx) ; SSE42-NEXT: retq ; @@ -744,14 +744,14 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vmovq %xmm2, (%rsi) +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovq %xmm2, (%rdx) ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm2, (%rsi) -; AVX1-NEXT: vmovq %xmm3, (%rdx) ; AVX1-NEXT: vmovq %xmm0, (%rcx) ; AVX1-NEXT: retq ; @@ -762,14 +762,14 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rdx) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} 
xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) ; AVX2-NEXT: vmovq %xmm0, (%rcx) ; AVX2-NEXT: retq ; @@ -778,10 +778,10 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOP-NEXT: vmovdqu (%rdi), %xmm1 ; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,3,6,9,12,15],xmm0[2,5],xmm1[u,u,u,u,u,u,u,u] -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u] ; XOP-NEXT: vmovq %xmm2, (%rsi) -; XOP-NEXT: vmovq %xmm3, (%rdx) +; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u] +; XOP-NEXT: vmovq %xmm2, (%rdx) +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u] ; XOP-NEXT: vmovq %xmm0, (%rcx) ; XOP-NEXT: retq %wide.vec = load <24 x i8>, ptr %p, align 4 diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index 759055d284d12..1a92365638814 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -138,22 +138,25 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,u,819,u] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1024,2048,2048,2] +; SSE2-NEXT: pmuludq %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2048,u,2,u] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: pslld $10, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE2-NEXT: orps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll index 304daab6d17a9..2e85a4e60a253 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -319,9 +319,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE2-LABEL: constant_funnnel_v2i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,u,u] ; SSE2-NEXT: 
pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -333,8 +333,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,u,u] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,u,u] ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -345,8 +345,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,1,u] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,32,u,u] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,u,u] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -411,9 +411,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,1,u] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,32,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll index ae5dd18d4b663..8db54147b2fb7 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll @@ -499,11 +499,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-NEXT: psrld $28, %xmm1 ; SSE2-NEXT: psrld $27, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,1,u] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE2-NEXT: pslld $4, %xmm0 +; SSE2-NEXT: pslld $5, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -514,7 +512,10 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) 
nounwind { ; SSE41-NEXT: psrld $27, %xmm2 ; SSE41-NEXT: psrld $28, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pslld $5, %xmm1 +; SSE41-NEXT: pslld $4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -523,7 +524,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX1-NEXT: vpsrld $27, %xmm1, %xmm2 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1] +; AVX1-NEXT: vpslld $5, %xmm0, %xmm2 +; AVX1-NEXT: vpslld $4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -597,11 +600,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; X86-SSE2-NEXT: psrld $28, %xmm1 ; X86-SSE2-NEXT: psrld $27, %xmm2 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,1,u] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: pslld $4, %xmm0 +; X86-SSE2-NEXT: pslld $5, %xmm2 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll index 4b42b189538ac..17bbfa1208c01 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -341,9 +341,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE2-LABEL: constant_funnnel_v2i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -355,8 +355,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,u,u] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,u,u] ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -367,8 +367,8 @@ define <2 x 
i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,1,u] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,1,1] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,134217728,u,u] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,u,u] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -433,9 +433,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,1,1] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,u,1,u] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,134217728,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll index 2d8670a6d3f23..144e77b87f44c 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll @@ -497,42 +497,35 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: constant_funnnel_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE2-NEXT: psrld $4, %xmm1 ; SSE2-NEXT: psrld $5, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld $4, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: pslld $28, %xmm0 -; SSE2-NEXT: pslld $27, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pslld $27, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrld $5, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrld $4, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; SSE41-NEXT: psrld $4, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pslld $27, %xmm1 ; SSE41-NEXT: pslld $28, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; 
AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpsrld $4, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpslld $27, %xmm0, %xmm2 ; AVX1-NEXT: vpslld $28, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] @@ -606,17 +599,15 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: psrld $4, %xmm1 ; X86-SSE2-NEXT: psrld $5, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: psrld $4, %xmm3 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: pslld $28, %xmm0 -; X86-SSE2-NEXT: pslld $27, %xmm1 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: por %xmm3, %xmm0 +; X86-SSE2-NEXT: pslld $27, %xmm2 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) ret <2 x i32> %res diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index dbb4b9f64f4b7..e0410ae0cc5cb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -84,11 +84,11 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; @@ -96,8 +96,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm1, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -105,8 +105,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm1, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -114,8 +114,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FP-NEXT: retq ; @@ -123,17 +123,17 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride2_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rdx) ; AVX512-NEXT: retq %wide.vec = load <8 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index da902b3aed5ab..c932482f7af9d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -196,18 +196,18 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movq %xmm2, (%rsi) +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: movq %xmm1, (%rdx) ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm1, (%rdx) +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -217,14 +217,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm2, (%rsi) +; 
AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm2, (%rdx) ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm2, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) ; AVX-NEXT: vmovq %xmm0, (%rcx) ; AVX-NEXT: retq ; @@ -234,14 +234,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm2, (%rdx) ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) ; AVX2-NEXT: vmovq %xmm0, (%rcx) ; AVX2-NEXT: retq ; @@ -251,13 +251,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FP-NEXT: retq ; @@ -267,13 +267,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) ; 
AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FCP-NEXT: retq ; @@ -283,14 +283,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm2, (%rdx) ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) ; AVX512-NEXT: vmovq %xmm0, (%rcx) ; AVX512-NEXT: retq ; @@ -300,13 +300,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: retq ; @@ -316,14 +316,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-NEXT: retq ; @@ -333,13 +333,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: retq ; @@ -348,15 +348,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -365,13 +366,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11] +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -380,15 +381,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] -; AVX512DQ-BW-NEXT: vpshuflw 
{{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -397,13 +399,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 01aacc1e06258..d4e5d4c16a9ec 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -220,20 +220,20 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movq %xmm5, (%rsi) ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movq %xmm3, (%rdx) ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movq %xmm3, (%rcx) ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm5, (%rsi) -; SSE-NEXT: movq %xmm3, (%rdx) -; SSE-NEXT: movq %xmm4, (%rcx) ; SSE-NEXT: movq %xmm0, (%r8) ; SSE-NEXT: retq ; @@ -246,23 +246,23 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX-NEXT: vmovq %xmm2, (%rcx) +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) -; AVX-NEXT: vmovq %xmm4, (%rcx) -; AVX-NEXT: vmovq %xmm1, (%r8) +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq %xmm0, (%r8) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i16_stride4_vf4: @@ -274,23 +274,23 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-NEXT: vmovq %xmm0, (%rsi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-NEXT: vmovq %xmm2, (%rcx) +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 
= xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) -; AVX2-NEXT: vmovq %xmm4, (%rcx) -; AVX2-NEXT: vmovq %xmm1, (%r8) +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovq %xmm0, (%r8) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i16_stride4_vf4: @@ -302,22 +302,22 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i16_stride4_vf4: @@ -329,125 +329,125 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; 
AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride4_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512-NEXT: vpmovqw %ymm3, (%r8) +; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqw %ymm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride4_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride4_vf4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: 
load_i16_stride4_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512BW-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512BW-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512BW-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512BW-NEXT: vpmovqw %ymm3, (%r8) +; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512BW-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovqw %ymm0, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride4_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512BW-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride4_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-BW-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-BW-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 9b19ec15c6f55..8fb622228a26e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -288,55 +288,55 @@ define void @load_i16_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), 
%xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,0,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movq %xmm1, (%rsi) -; SSE-NEXT: movq %xmm4, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm7, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movq %xmm4, (%rsi) +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: movq %xmm3, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm1[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,0,3,4,5,6,7] +; SSE-NEXT: movq %xmm3, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: movq %xmm3, (%r8) +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movq %xmm3, (%r9) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i16_stride5_vf4: @@ -349,30 +349,30 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX-NEXT: vpsrlq $48, %xmm2, %xmm4 -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] ; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm1, (%r9) +; AVX-NEXT: vpsrlq $48, %xmm2, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = 
xmm4[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%r8) +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] +; AVX-NEXT: vmovq %xmm0, (%r9) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i16_stride5_vf4: @@ -385,22 +385,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rcx) +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm5, (%rcx) -; AVX2-NEXT: vmovq %xmm6, (%r8) ; AVX2-NEXT: vmovq %xmm0, (%r9) ; AVX2-NEXT: retq ; @@ -412,22 +412,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FP-NEXT: retq ; @@ -439,58 +439,64 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} 
xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride5_vf4: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpextrw $5, %xmm0, %eax -; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrw $6, %xmm0, %eax -; AVX512-NEXT: vpextrw $1, %xmm0, %r10d -; AVX512-NEXT: vmovd %r10d, %xmm4 -; AVX512-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 -; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %r10d +; AVX512-NEXT: vmovd %xmm2, %r11d +; AVX512-NEXT: vpextrw $3, %xmm1, %ebx +; AVX512-NEXT: vpextrw $6, %xmm0, %ebp +; AVX512-NEXT: vpextrw $1, %xmm0, %r14d +; AVX512-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512-NEXT: vmovq %xmm1, (%rsi) +; AVX512-NEXT: vmovd %r14d, %xmm1 +; AVX512-NEXT: vpinsrw $1, %ebp, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $3, %r11d, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%rcx) +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%r8) +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; 
AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) -; AVX512-NEXT: vmovq %xmm5, (%rcx) -; AVX512-NEXT: vmovq %xmm6, (%r8) ; AVX512-NEXT: vmovq %xmm0, (%r9) +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride5_vf4: @@ -498,65 +504,71 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovd %xmm2, %eax +; AVX512-FCP-NEXT: vmovd %xmm2, %r10d +; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %r11d +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpinsrw $3, %r10d, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride5_vf4: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %rbx ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vpextrw $5, %xmm0, %eax -; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 -; 
AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpextrw $7, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpextrw $6, %xmm0, %eax -; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r10d -; AVX512DQ-NEXT: vmovd %r10d, %xmm4 -; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpextrw $3, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 -; AVX512DQ-NEXT: vmovd %xmm2, %eax +; AVX512DQ-NEXT: vpextrw $5, %xmm0, %r10d +; AVX512DQ-NEXT: vmovd %xmm2, %r11d +; AVX512DQ-NEXT: vpextrw $3, %xmm1, %ebx +; AVX512DQ-NEXT: vpextrw $6, %xmm0, %ebp +; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r14d +; AVX512DQ-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-NEXT: vmovd %r14d, %xmm1 +; AVX512DQ-NEXT: vpinsrw $1, %ebp, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $3, %r11d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%r8) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %rbp ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride5_vf4: @@ -564,29 +576,29 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpextrw $7, %xmm1, %eax -; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, %eax -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovd %xmm2, %eax +; AVX512DQ-FCP-NEXT: vmovd %xmm2, %r10d +; 
AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, %r11d +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpinsrw $3, %r10d, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-FCP-NEXT: retq ; @@ -600,19 +612,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-NEXT: movl 32(%rdi), %edi ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-NEXT: 
vzeroupper ; AVX512BW-NEXT: retq ; @@ -626,19 +639,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-FCP-NEXT: movl 32(%rdi), %edi ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -652,19 +666,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-NEXT: movl 32(%rdi), %edi ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -678,19 +693,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-FCP-NEXT: movl 32(%rdi), %edi ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; 
AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index feb75b21d5c8d..dc8a9ed4a4ccc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -382,57 +382,57 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: movq %xmm2, (%rsi) ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3] -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa %xmm2, %xmm8 
-; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movq %xmm7, (%rcx) +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movq %xmm2, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movq %xmm3, (%r9) ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm3, (%rdx) -; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm2, (%r8) -; SSE-NEXT: movq %xmm6, (%r9) ; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -448,32 +448,32 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX-NEXT: vpsrld $16, %xmm1, %xmm5 +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpsrld $16, %xmm1, %xmm4 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm2[4,5],xmm4[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%r8) +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX-NEXT: vmovq %xmm3, (%r9) ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,3,2,3,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm7, (%r9) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -486,24 +486,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; 
AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm4, (%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm6, (%rcx) -; AVX2-NEXT: vmovq %xmm5, (%r8) ; AVX2-NEXT: vmovq %xmm1, (%r9) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -516,23 +516,23 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm5, (%r8) ; AVX2-FP-NEXT: vmovq %xmm1, (%r9) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -545,23 +545,23 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; @@ -574,26 +574,26 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vmovq %xmm0, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) -; AVX512-NEXT: vmovq %xmm1, (%r8) -; AVX512-NEXT: vmovq %xmm5, (%r9) -; AVX512-NEXT: vmovq %xmm2, (%rax) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; 
AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%rcx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%r8) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%r9) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -606,25 +606,25 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -637,26 +637,26 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-NEXT: vmovq %xmm2, (%rax) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -669,25 +669,25 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: 
vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -697,22 +697,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -722,22 +722,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) 
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -747,22 +747,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -772,22 +772,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; 
AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 038c73bd9fed2..e89248a5474c7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -418,77 +418,77 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; 
SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,2,3] +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movq %xmm7, (%rsi) +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movq %xmm5, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: psrlq $16, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1] -; SSE-NEXT: pslld $16, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movq %xmm7, (%rcx) +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrlq $16, %xmm6 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: pslld $16, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm0, 
(%rdx) -; SSE-NEXT: movq %xmm7, (%rcx) -; SSE-NEXT: movq %xmm8, (%r8) -; SSE-NEXT: movq %xmm6, (%r9) -; SSE-NEXT: movq %xmm10, (%rdi) -; SSE-NEXT: movq %xmm1, (%rax) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movq %xmm5, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movq %xmm4, (%r9) +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movq %xmm5, (%rcx) +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i16_stride7_vf4: @@ -497,54 +497,54 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,3,4,5,6,7] -; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,3] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = 
xmm4[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX-NEXT: vpslld $16, %xmm2, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] -; AVX-NEXT: vpsrlq $16, %xmm4, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX-NEXT: vpsrlq $48, %xmm1, %xmm10 -; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vpsrld $16, %xmm0, %xmm10 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX-NEXT: vmovq %xmm4, (%rcx) +; AVX-NEXT: vpslld $16, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r8) +; AVX-NEXT: vpsrlq $16, %xmm3, %xmm4 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3],xmm10[4,5,6,7] +; AVX-NEXT: vpsrlq $48, %xmm1, %xmm6 +; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r9) +; AVX-NEXT: vpsrld $16, %xmm0, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm3, (%r10) ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: 
vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm7, (%rcx) -; AVX-NEXT: vmovq %xmm8, (%r8) -; AVX-NEXT: vmovq %xmm9, (%r9) -; AVX-NEXT: vmovq %xmm4, (%r10) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -552,51 +552,51 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7] +; AVX2-NEXT: vmovq %xmm5, (%rsi) +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX2-NEXT: vmovq %xmm5, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vmovq %xmm2, (%rcx) +; AVX2-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm2, (%r8) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vmovq %xmm2, (%r9) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vmovq %xmm2, (%r10) +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-NEXT: vmovq %xmm1, (%rsi) -; AVX2-NEXT: vmovq %xmm6, (%rdx) -; AVX2-NEXT: vmovq %xmm3, (%rcx) -; AVX2-NEXT: vmovq %xmm4, (%r8) -; AVX2-NEXT: vmovq %xmm5, (%r9) -; AVX2-NEXT: vmovq %xmm7, (%r10) +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -605,8 +605,8 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 @@ -615,37 +615,37 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm5, (%rsi) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vextracti128 
$1, %ymm3, %xmm4 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FP-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -654,8 +654,8 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 @@ 
-664,37 +664,37 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, 
%xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -708,47 +708,47 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512-NEXT: vmovq %xmm5, (%rsi) +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX512-NEXT: vmovq %xmm5, (%rdx) ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,5,7,6,7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) ; AVX512-NEXT: vmovq %xmm2, (%rcx) -; AVX512-NEXT: vmovq %xmm5, (%r8) -; AVX512-NEXT: vmovq %xmm6, (%r9) -; AVX512-NEXT: vmovq %xmm7, (%r10) -; AVX512-NEXT: vmovq %xmm3, (%rax) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm2, (%r8) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovq %xmm2, (%r9) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovq %xmm2, (%r10) +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -756,48 +756,48 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm4[1,2,3,4,5,6],xmm1[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512-FCP-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -810,47 +810,47 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovq %xmm5, (%rsi) +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX512DQ-NEXT: vmovq %xmm5, (%rdx) ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; 
AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-NEXT: vmovq %xmm3, (%rax) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vmovq %xmm2, (%r10) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -858,48 +858,48 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm4[1,2,3,4,5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
xmm0 = xmm0[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -910,25 +910,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-NEXT: vmovq %xmm6, (%r10) -; AVX512BW-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; 
AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -939,25 +939,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -968,25 +968,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; 
AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -997,25 +997,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <28 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index fff21f9aad1bb..b249950eb8694 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll 
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -296,41 +296,41 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[3,3,3,3] +; SSE-NEXT: movq %xmm6, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movq %xmm7, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movq %xmm5, (%rcx) ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movq %xmm6, (%r8) +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movq %xmm2, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movq %xmm6, (%rsi) -; SSE-NEXT: movq %xmm8, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm7, (%r8) -; SSE-NEXT: movq %xmm1, (%r9) -; SSE-NEXT: movq %xmm4, (%r11) +; SSE-NEXT: movq %xmm3, (%r11) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movq %xmm0, (%r10) -; SSE-NEXT: movq %xmm3, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movq %xmm2, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: 
load_i16_stride8_vf4: @@ -345,28 +345,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX-NEXT: vmovq %xmm6, (%rsi) +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm6, (%rdx) +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX-NEXT: vmovq %xmm6, (%rcx) ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r8) ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm6, (%rsi) -; AVX-NEXT: vmovq %xmm7, (%rdx) -; AVX-NEXT: vmovq %xmm8, (%rcx) -; AVX-NEXT: vmovq %xmm4, (%r8) ; AVX-NEXT: vmovq %xmm1, (%r9) -; AVX-NEXT: vmovq %xmm3, (%r11) -; AVX-NEXT: vmovq %xmm5, (%r10) +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vmovq %xmm1, (%r11) +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vmovq %xmm1, (%r10) +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -382,28 +382,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-NEXT: vmovq %xmm6, (%rsi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX2-NEXT: vmovq %xmm6, (%rdx) +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-NEXT: vmovq %xmm6, (%rcx) ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r8) ; AVX2-NEXT: vpunpckhwd {{.*#+}} 
xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-NEXT: vmovq %xmm6, (%rsi) -; AVX2-NEXT: vmovq %xmm7, (%rdx) -; AVX2-NEXT: vmovq %xmm8, (%rcx) -; AVX2-NEXT: vmovq %xmm4, (%r8) ; AVX2-NEXT: vmovq %xmm1, (%r9) -; AVX2-NEXT: vmovq %xmm3, (%r11) -; AVX2-NEXT: vmovq %xmm5, (%r10) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-NEXT: vmovq %xmm1, (%r11) +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vmovq %xmm1, (%r10) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -419,28 +419,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FP-NEXT: vmovq %xmm6, (%rsi) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX2-FP-NEXT: vmovq %xmm6, (%rdx) +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FP-NEXT: vmovq %xmm6, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm7, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm8, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FP-NEXT: vmovq %xmm1, (%r9) -; AVX2-FP-NEXT: vmovq %xmm3, (%r11) -; AVX2-FP-NEXT: vmovq %xmm5, (%r10) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FP-NEXT: vmovq %xmm1, (%r11) +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} 
xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-FP-NEXT: vmovq %xmm1, (%r10) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -456,28 +456,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx) +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FCP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm8, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm1, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r11) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r10) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovq %xmm1, (%r11) +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-FCP-NEXT: vmovq %xmm1, (%r10) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; @@ -493,25 +493,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3] -; AVX512-NEXT: 
vpermt2d %xmm4, %xmm9, %xmm5 +; AVX512-NEXT: vmovq %xmm6, (%rsi) +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX512-NEXT: vmovq %xmm6, (%rdx) +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-NEXT: vmovq %xmm6, (%rcx) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,3,3] +; AVX512-NEXT: vpermt2d %xmm4, %xmm6, %xmm5 +; AVX512-NEXT: vmovq %xmm5, (%r8) ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-NEXT: vpermt2d %xmm2, %xmm9, %xmm0 -; AVX512-NEXT: vmovq %xmm6, (%rsi) -; AVX512-NEXT: vmovq %xmm7, (%rdx) -; AVX512-NEXT: vmovq %xmm8, (%rcx) -; AVX512-NEXT: vmovq %xmm5, (%r8) ; AVX512-NEXT: vmovq %xmm1, (%r9) -; AVX512-NEXT: vmovq %xmm3, (%r11) -; AVX512-NEXT: vmovq %xmm4, (%r10) +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX512-NEXT: vmovq %xmm1, (%r11) +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512-NEXT: vmovq %xmm1, (%r10) +; AVX512-NEXT: vpermt2d %xmm2, %xmm6, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: retq ; @@ -527,25 +527,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1] -; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3] -; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5 +; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,5,1,1] +; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm7 +; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm6, %xmm7 +; AVX512-FCP-NEXT: vmovq %xmm7, (%rdx) +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-FCP-NEXT: vmovq %xmm7, (%rcx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [3,7,3,3] +; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm5 +; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm8, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm9, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm1, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r11) -; 
AVX512-FCP-NEXT: vmovq %xmm3, (%r10) +; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm6 +; AVX512-FCP-NEXT: vmovq %xmm6, (%r11) +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512-FCP-NEXT: vmovq %xmm1, (%r10) +; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm7, %xmm0 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: retq ; @@ -561,25 +561,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3] -; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm9, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm6, (%rsi) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX512DQ-NEXT: vmovq %xmm6, (%rdx) +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-NEXT: vmovq %xmm6, (%rcx) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,3,3] +; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm6, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm9, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm6, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm7, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm8, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-NEXT: vmovq %xmm1, (%r9) -; AVX512DQ-NEXT: vmovq %xmm3, (%r11) -; AVX512DQ-NEXT: vmovq %xmm4, (%r10) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX512DQ-NEXT: vmovq %xmm1, (%r11) +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-NEXT: vmovq %xmm1, (%r10) +; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm6, %xmm0 ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: retq ; @@ -595,25 +595,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3] -; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5 +; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,5,1,1] +; 
AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm6, %xmm7 +; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%rdx) +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%rcx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [3,7,3,3] +; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm5 +; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm9, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r11) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r10) +; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm6 +; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r11) +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r10) +; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm7, %xmm0 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: retq ; @@ -625,28 +625,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-NEXT: vmovq %xmm6, (%r11) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r11) +; AVX512BW-NEXT: vmovq 
{{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -658,28 +658,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r11) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -691,28 +691,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, 
%zmm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r11) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r11) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -724,28 +724,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; 
AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r11) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <32 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll index f2c5a91d2cca3..995d641644dfa 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -20,8 +20,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: movq %xmm1, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; @@ -29,8 +29,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rdi), %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX-NEXT: vmovlps %xmm1, (%rsi) +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX-NEXT: vmovlps %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -38,8 +38,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-NEXT: vmovlps %xmm1, (%rsi) +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-NEXT: vmovlps %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -47,8 +47,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FP-NEXT: vmovlps %xmm1, (%rsi) +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx) ; AVX2-FP-NEXT: retq ; @@ -56,8 +56,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX2-FCP-NEXT: retq ; @@ -65,8 +65,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512: # %bb.0: ; AVX512-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[1,3,2,3] ; AVX512-NEXT: vmovlps %xmm1, (%rsi) +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-NEXT: vmovlps %xmm0, (%rdx) ; AVX512-NEXT: retq ; @@ -74,8 +74,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512-FCP-NEXT: retq ; @@ -83,8 +83,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-NEXT: retq ; @@ -92,8 +92,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-FCP-NEXT: retq ; @@ -101,8 +101,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-NEXT: vmovlps %xmm1, (%rsi) +; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-NEXT: vmovlps %xmm0, (%rdx) ; AVX512BW-NEXT: retq ; @@ -110,8 +110,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512BW-FCP-NEXT: retq ; @@ -119,8 +119,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-BW-NEXT: retq ; @@ -128,8 +128,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <4 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 34f23213500c1..8af9594f81480 
100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -21,13 +21,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movq %xmm2, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movq %xmm2, (%rdx) ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm3, (%rdx) ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -36,12 +36,12 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovaps (%rdi), %xmm0 ; AVX-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0,2,3] +; AVX-NEXT: vmovlps %xmm2, (%rsi) +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3] +; AVX-NEXT: vmovlps %xmm2, (%rdx) ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX-NEXT: vmovlps %xmm2, (%rsi) -; AVX-NEXT: vmovlps %xmm3, (%rdx) ; AVX-NEXT: vmovlps %xmm0, (%rcx) ; AVX-NEXT: retq ; @@ -50,13 +50,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm3 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-NEXT: vmovlps %xmm2, (%rsi) +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-NEXT: vmovlps %xmm0, (%rdx) -; AVX2-NEXT: vmovlps %xmm1, (%rcx) +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-NEXT: vmovlps %xmm0, (%rcx) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i32_stride3_vf2: @@ -64,13 +64,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx) -; AVX2-FP-NEXT: vmovlps %xmm1, (%rcx) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FP-NEXT: vmovlps %xmm0, (%rcx) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i32_stride3_vf2: @@ -78,13 +78,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0 ; 
AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi) +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx) -; AVX2-FCP-NEXT: vmovlps %xmm1, (%rcx) +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FCP-NEXT: vmovlps %xmm0, (%rcx) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i32_stride3_vf2: @@ -92,13 +92,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512-NEXT: vbroadcastss 8(%rdi), %xmm3 -; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512-NEXT: vmovlps %xmm2, (%rsi) +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512-NEXT: vmovlps %xmm0, (%rdx) -; AVX512-NEXT: vmovlps %xmm1, (%rcx) +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-NEXT: vmovlps %xmm0, (%rcx) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i32_stride3_vf2: @@ -119,13 +119,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512DQ-NEXT: vbroadcastss 8(%rdi), %xmm3 -; AVX512DQ-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512DQ-NEXT: vmovlps %xmm2, (%rsi) +; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovlps %xmm1, (%rcx) +; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-NEXT: vmovlps %xmm0, (%rcx) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2: @@ -146,13 +146,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512BW-NEXT: vbroadcastss 8(%rdi), %xmm3 -; AVX512BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512BW-NEXT: vmovlps %xmm2, (%rsi) +; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512BW-NEXT: vmovlps %xmm0, (%rdx) -; AVX512BW-NEXT: vmovlps %xmm1, (%rcx) +; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512BW-NEXT: vmovlps %xmm0, (%rcx) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride3_vf2: @@ -173,13 +173,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 ; 
AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512DQ-BW-NEXT: vbroadcastss 8(%rdi), %xmm3 -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx) -; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rcx) +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 822d31eb45139..f7ddcfcc625b5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -22,13 +22,13 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm3, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE-NEXT: movq %xmm2, (%rdx) +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movq %xmm0, (%rcx) -; SSE-NEXT: movq %xmm1, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movq %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i32_stride4_vf2: @@ -36,11 +36,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vmovq %xmm2, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX-NEXT: vmovq %xmm2, (%rdx) +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vmovq %xmm0, (%rcx) ; AVX-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX-NEXT: retq @@ -50,11 +50,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX2-NEXT: vmovq %xmm2, (%rdx) +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vmovq %xmm0, (%rcx) ; AVX2-NEXT: vpextrq $1, %xmm0, (%r8) ; 
AVX2-NEXT: retq @@ -64,11 +64,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FP-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX2-FP-NEXT: retq @@ -78,11 +78,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FCP-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX2-FCP-NEXT: retq @@ -92,11 +92,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vmovq %xmm0, (%rcx) ; AVX512-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512-NEXT: retq @@ -108,9 +108,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] ; AVX512-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper @@ -121,11 +121,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; 
AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512DQ-NEXT: retq @@ -137,9 +137,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] ; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -150,11 +150,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512BW-NEXT: retq @@ -166,9 +166,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] ; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper @@ -179,11 +179,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512DQ-BW-NEXT: retq @@ -195,9 +195,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] ; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index 4f80140bc6c1b..fea8ebdf116fa 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -24,19 +24,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movq %xmm4, (%rsi) ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movq %xmm4, (%rdx) ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm5, (%rdx) ; SSE-NEXT: movq %xmm0, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movq %xmm4, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movq %xmm1, (%r9) ; SSE-NEXT: retq ; @@ -46,16 +46,16 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] ; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vpextrq $1, %xmm5, (%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX-NEXT: vpextrq $1, %xmm1, (%rcx) +; AVX-NEXT: 
vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX-NEXT: vmovq %xmm0, (%r8) -; AVX-NEXT: vmovq %xmm1, (%r9) +; AVX-NEXT: vmovq %xmm4, (%r9) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i32_stride5_vf2: @@ -64,17 +64,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX2-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX2-NEXT: vpbroadcastd 16(%rdi), %ymm4 ; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-NEXT: vpextrq $1, %xmm1, (%rcx) +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX2-NEXT: vmovq %xmm0, (%r8) -; AVX2-NEXT: vmovq %xmm2, (%r9) +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-NEXT: vmovq %xmm0, (%r9) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -84,17 +84,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX2-FP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX2-FP-NEXT: vpbroadcastd 16(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-FP-NEXT: vpextrq $1, %xmm1, (%rcx) +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX2-FP-NEXT: vmovq %xmm0, (%r8) -; AVX2-FP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-FP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -104,17 +104,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX2-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX2-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: 
vmovq %xmm4, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vpextrq $1, %xmm1, (%rcx) +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX2-FCP-NEXT: vmovq %xmm0, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -123,21 +123,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512-NEXT: vpextrd $2, %xmm1, %eax -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm4, (%rdx) +; AVX512-NEXT: vpextrd $3, %xmm1, %r10d +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512-NEXT: vpbroadcastd 16(%rdi), %ymm4 +; AVX512-NEXT: vmovq %xmm1, (%rsi) +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1 ; AVX512-NEXT: vmovq %xmm1, (%rcx) +; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512-NEXT: vmovq %xmm0, (%r8) -; AVX512-NEXT: vmovq %xmm2, (%r9) +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512-NEXT: vmovq %xmm0, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -146,19 +146,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] -; AVX512-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovlps %xmm4, (%rcx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0] +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0] +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm1, 
(%r9) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -167,21 +167,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512DQ-NEXT: vpextrd $2, %xmm1, %eax -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-NEXT: vpextrd $3, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm4, (%rdx) +; AVX512DQ-NEXT: vpextrd $3, %xmm1, %r10d +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-NEXT: vpbroadcastd 16(%rdi), %ymm4 +; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1 ; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512DQ-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -190,19 +190,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] -; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%rcx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -211,21 +211,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; 
AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax -; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512BW-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-NEXT: vmovq %xmm4, (%rdx) +; AVX512BW-NEXT: vpextrd $3, %xmm1, %r10d +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512BW-NEXT: vpbroadcastd 16(%rdi), %ymm4 +; AVX512BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1 ; AVX512BW-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512BW-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512BW-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -234,19 +234,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] -; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%rcx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -255,21 +255,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm1, %eax -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; 
AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vpextrd $3, %xmm1, %eax -; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-BW-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx) +; AVX512DQ-BW-NEXT: vpextrd $3, %xmm1, %r10d +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vpbroadcastd 16(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -278,19 +278,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <10 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 85ed61811af53..49b131827c447 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -18,31 +18,31 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf2: ; SSE: # %bb.0: -; SSE-NEXT: 
movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSE-NEXT: movq %xmm1, (%rsi) -; SSE-NEXT: movq %xmm4, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %xmm3, (%rdx) +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movq %xmm4, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movq %xmm5, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movq %xmm0, (%r9) -; SSE-NEXT: movq %xmm7, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movq %xmm3, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i32_stride6_vf2: @@ -53,22 +53,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,3,3] ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm0[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX-NEXT: vmovlps %xmm3, (%rsi) +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3] +; AVX-NEXT: vmovlps %xmm3, (%rdx) +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3] +; AVX-NEXT: vmovlps %xmm3, (%rcx) ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,3,3] -; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] -; AVX-NEXT: vmovlps %xmm3, (%rsi) -; AVX-NEXT: vmovlps %xmm4, (%rdx) -; AVX-NEXT: vmovlps %xmm5, (%rcx) ; AVX-NEXT: vmovlps %xmm0, 
(%r8) -; AVX-NEXT: vmovlps %xmm6, (%r9) -; AVX-NEXT: vmovlps %xmm1, (%rax) +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,2,3,3] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: vmovlps %xmm0, (%r9) +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-NEXT: vmovlps %xmm0, (%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i32_stride6_vf2: @@ -80,22 +80,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps 32(%rdi), %xmm3 ; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovlps %xmm4, (%rsi) ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX2-NEXT: vmovlps %xmm2, (%rdx) +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX2-NEXT: vmovlps %xmm2, (%rcx) ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX2-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] -; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vmovlps %xmm4, (%rsi) -; AVX2-NEXT: vmovlps %xmm2, (%rdx) -; AVX2-NEXT: vmovlps %xmm5, (%rcx) ; AVX2-NEXT: vmovlps %xmm1, (%r8) -; AVX2-NEXT: vmovlps %xmm3, (%r9) +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vmovlps %xmm1, (%r9) +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovlps %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -109,22 +109,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm3 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovlps %xmm4, (%rsi) ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX2-FP-NEXT: vmovlps %xmm2, (%rcx) ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-FP-NEXT: vmovlps %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx) -; AVX2-FP-NEXT: vmovlps %xmm5, (%rcx) ; AVX2-FP-NEXT: vmovlps %xmm1, (%r8) -; AVX2-FP-NEXT: vmovlps %xmm3, (%r9) +; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] +; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovlps %xmm1, (%r9) +; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = 
[5,3,0,0] +; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -138,54 +138,56 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm3 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovlps %xmm4, (%rsi) ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX2-FCP-NEXT: vmovlps %xmm2, (%rcx) ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-FCP-NEXT: vmovlps %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx) -; AVX2-FCP-NEXT: vmovlps %xmm5, (%rcx) ; AVX2-FCP-NEXT: vmovlps %xmm1, (%r8) -; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9) +; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovlps %xmm1, (%r9) +; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i32_stride6_vf2: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512-NEXT: vextractps $2, %xmm1, %r10d -; AVX512-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512-NEXT: vextractps $3, %xmm1, %r10d +; AVX512-NEXT: vextractps $3, %xmm1, %r11d +; AVX512-NEXT: vmovd %xmm2, %ebx +; AVX512-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1 +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovq %xmm1, (%rsi) ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512-NEXT: vmovd %xmm2, %r10d -; AVX512-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1 +; AVX512-NEXT: vmovq %xmm1, (%rcx) ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] -; AVX512-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) ; AVX512-NEXT: vmovq %xmm0, (%r8) -; AVX512-NEXT: vmovlps %xmm2, (%r9) -; 
AVX512-NEXT: vmovlps %xmm5, (%rax) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512-NEXT: vmovlps %xmm0, (%r9) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512-NEXT: vmovlps %xmm0, (%rax) +; AVX512-NEXT: popq %rbx ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -195,56 +197,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] -; AVX512-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] -; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] -; AVX512-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) -; AVX512-FCP-NEXT: vmovlps %xmm4, (%r9) -; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0] +; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rdx) +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4] +; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0 +; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 +; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride6_vf2: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbx ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-NEXT: vextractps $2, %xmm1, %r10d -; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512DQ-NEXT: vextractps $3, %xmm1, %r10d +; AVX512DQ-NEXT: vextractps $3, %xmm1, %r11d +; AVX512DQ-NEXT: vmovd %xmm2, %ebx +; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-NEXT: vmovd %xmm2, %r10d -; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) +; 
AVX512DQ-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1 +; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] -; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512DQ-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-NEXT: vmovlps %xmm2, (%r9) -; AVX512DQ-NEXT: vmovlps %xmm5, (%rax) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512DQ-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512DQ-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%rax) +; AVX512DQ-NEXT: popq %rbx ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -254,56 +258,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] -; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] -; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] -; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) -; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r9) -; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rdx) +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0 +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride6_vf2: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbx ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa 
32(%rdi), %xmm2 ; AVX512BW-NEXT: vextractps $2, %xmm1, %r10d -; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512BW-NEXT: vextractps $3, %xmm1, %r10d +; AVX512BW-NEXT: vextractps $3, %xmm1, %r11d +; AVX512BW-NEXT: vmovd %xmm2, %ebx +; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1 +; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512BW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovq %xmm1, (%rsi) ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512BW-NEXT: vmovd %xmm2, %r10d -; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1 +; AVX512BW-NEXT: vmovq %xmm1, (%rcx) ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] -; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512BW-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512BW-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-NEXT: vmovlps %xmm2, (%r9) -; AVX512BW-NEXT: vmovlps %xmm5, (%rax) +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512BW-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%r9) +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512BW-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%rax) +; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -313,56 +319,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] -; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] -; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] -; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) -; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r9) -; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4] +; AVX512BW-FCP-NEXT: vpermi2d %xmm3, 
%xmm2, %xmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride6_vf2: ; AVX512DQ-BW: # %bb.0: +; AVX512DQ-BW-NEXT: pushq %rbx ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vextractps $2, %xmm1, %r10d -; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %r10d +; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %r11d +; AVX512DQ-BW-NEXT: vmovd %xmm2, %ebx +; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1 +; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-BW-NEXT: vmovd %xmm2, %r10d -; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx) ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] -; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%r9) -; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax) +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512DQ-BW-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512DQ-BW-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax) +; AVX512DQ-BW-NEXT: popq %rbx ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -372,25 +380,25 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] -; AVX512DQ-BW-FCP-NEXT: 
vpermi2d %xmm4, %xmm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] -; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 7948141f6becd..64ddca71898b3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -18,35 +18,35 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf2: ; SSE: # %bb.0: -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movq %xmm1, (%rsi) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movq %xmm0, (%rsi) ; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE-NEXT: movq %xmm5, (%rcx) +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm1, (%r9) -; SSE-NEXT: movq %xmm3, (%r10) -; SSE-NEXT: movq %xmm7, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq %xmm0, (%r9) +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movq %xmm2, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movq %xmm4, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i32_stride7_vf2: @@ -60,26 +60,26 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0,2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX-NEXT: vmovlps %xmm5, (%rsi) +; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] +; AVX-NEXT: vmovlps %xmm5, (%rdx) +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX-NEXT: vmovlps %xmm5, (%rcx) ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4] -; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] +; AVX-NEXT: vmovlps %xmm2, (%r8) +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX-NEXT: vmovlps %xmm2, (%r9) +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX-NEXT: vmovlps %xmm2, (%r10) ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX-NEXT: vmovlps %xmm5, (%rsi) -; AVX-NEXT: vmovlps %xmm6, (%rdx) -; AVX-NEXT: vmovlps %xmm7, (%rcx) -; AVX-NEXT: vmovlps %xmm2, (%r8) -; AVX-NEXT: vmovlps %xmm3, (%r9) -; AVX-NEXT: vmovlps %xmm4, (%r10) ; AVX-NEXT: vmovlps %xmm0, (%rax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -94,27 +94,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-NEXT: vmovaps 32(%rdi), 
%xmm4 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] -; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm6 -; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] +; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm5 +; AVX2-NEXT: vmovlps %xmm2, (%rsi) +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3] +; AVX2-NEXT: vmovlps %xmm2, (%rdx) +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovlps %xmm2, (%rcx) +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-NEXT: vmovlps %xmm2, (%r8) +; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovlps %xmm2, (%r9) +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3] +; AVX2-NEXT: vmovlps %xmm2, (%r10) ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovlps %xmm2, (%rsi) -; AVX2-NEXT: vmovlps %xmm5, (%rdx) -; AVX2-NEXT: vmovlps %xmm6, (%rcx) -; AVX2-NEXT: vmovlps %xmm3, (%r8) -; AVX2-NEXT: vmovlps %xmm4, (%r9) -; AVX2-NEXT: vmovlps %xmm7, (%r10) ; AVX2-NEXT: vmovlps %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -129,27 +129,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] -; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm7, %ymm4, %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] +; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm5 +; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3] +; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovlps %xmm2, (%rcx) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3] +; 
AVX2-FP-NEXT: vmovlps %xmm2, (%r8) +; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2 +; AVX2-FP-NEXT: vmovlps %xmm2, (%r9) +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3] +; AVX2-FP-NEXT: vmovlps %xmm2, (%r10) ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovlps %xmm5, (%rdx) -; AVX2-FP-NEXT: vmovlps %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovlps %xmm3, (%r8) -; AVX2-FP-NEXT: vmovlps %xmm4, (%r9) -; AVX2-FP-NEXT: vmovlps %xmm7, (%r10) ; AVX2-FP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -164,27 +164,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] -; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] +; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi) +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3] +; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-FCP-NEXT: vmovlps %xmm2, (%r8) +; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovlps %xmm2, (%r9) +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3] +; AVX2-FCP-NEXT: vmovlps %xmm2, (%r10) ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovlps %xmm5, (%rdx) -; AVX2-FCP-NEXT: vmovlps %xmm6, (%rcx) -; AVX2-FCP-NEXT: vmovlps %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovlps %xmm4, (%r9) -; AVX2-FCP-NEXT: vmovlps %xmm7, (%r10) ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -195,31 +195,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 ; AVX512-NEXT: vmovd %xmm1, %r11d -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512-NEXT: vmovaps (%rdi), %ymm6 +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-NEXT: vmovq %xmm2, (%rcx) ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512-NEXT: vmovaps (%rdi), %ymm5 -; AVX512-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) ; AVX512-NEXT: vmovq %xmm0, (%r8) -; AVX512-NEXT: vmovlps %xmm1, (%r9) -; AVX512-NEXT: vmovlps %xmm7, (%r10) -; AVX512-NEXT: vmovlps %xmm5, (%rax) +; AVX512-NEXT: vmovlps %xmm4, (%r9) +; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512-NEXT: vmovlps %xmm0, (%r10) +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -231,24 +231,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; 
AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -259,31 +259,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 ; AVX512DQ-NEXT: vmovd %xmm1, %r11d -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512DQ-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm6 +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512DQ-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm5 -; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512DQ-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512DQ-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-NEXT: vmovlps %xmm7, (%r10) -; AVX512DQ-NEXT: vmovlps %xmm5, (%rax) +; AVX512DQ-NEXT: vmovlps %xmm4, (%r9) +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512DQ-NEXT: vmovlps %xmm0, (%r10) +; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -295,24 +295,24 @@ 
define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -323,31 +323,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 ; AVX512BW-NEXT: vmovd %xmm1, %r11d -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512BW-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512BW-NEXT: vmovaps (%rdi), %ymm6 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512BW-NEXT: vmovq %xmm2, (%rcx) ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovaps (%rdi), %ymm5 -; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512BW-NEXT: vblendps {{.*#+}} ymm7 = 
ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512BW-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-NEXT: vmovlps %xmm1, (%r9) -; AVX512BW-NEXT: vmovlps %xmm7, (%r10) -; AVX512BW-NEXT: vmovlps %xmm5, (%rax) +; AVX512BW-NEXT: vmovlps %xmm4, (%r9) +; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512BW-NEXT: vmovlps %xmm0, (%r10) +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -359,24 +359,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -387,31 +387,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vpinsrd $1, 
28(%rdi), %xmm0, %xmm2 ; AVX512DQ-BW-NEXT: vmovd %xmm1, %r11d -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX512DQ-BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 +; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0] +; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-BW-NEXT: vmovlps %xmm7, (%r10) -; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax) +; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r9) +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r10) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -423,24 +423,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 13410fb5cc4b8..a118b4056b3d0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -27,22 +27,22 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm5, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movq %xmm0, (%rcx) -; SSE-NEXT: movq %xmm2, (%r8) -; SSE-NEXT: movq %xmm6, (%r9) -; SSE-NEXT: movq %xmm7, (%r11) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movq %xmm0, (%r8) +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movq %xmm0, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movq %xmm0, (%r11) +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: movq %xmm1, (%r10) -; SSE-NEXT: movq %xmm3, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i32_stride8_vf2: @@ -55,26 +55,26 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX-NEXT: vmovdqa (%rdi), %xmm3 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX-NEXT: 
vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rdx) ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] -; AVX-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX-NEXT: vmovq %xmm2, (%rcx) +; AVX-NEXT: vpextrq $1, %xmm2, (%r8) +; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vmovlps %xmm2, (%r9) +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX-NEXT: vmovlps %xmm2, (%r11) +; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vmovlps %xmm2, (%r10) ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm2, (%rcx) -; AVX-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX-NEXT: vmovlps %xmm3, (%r9) -; AVX-NEXT: vmovlps %xmm6, (%r11) -; AVX-NEXT: vmovlps %xmm7, (%r10) ; AVX-NEXT: vmovlps %xmm0, (%rax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -84,30 +84,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm4, (%rsi) -; AVX2-NEXT: vmovq %xmm5, (%rdx) +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rdx) +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vmovq %xmm2, (%rcx) ; AVX2-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX2-NEXT: vmovlps %xmm3, (%r9) -; AVX2-NEXT: vmovlps %xmm6, (%r11) +; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vmovlps %xmm2, (%r9) +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vmovlps %xmm2, (%r11) +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovlps %xmm1, (%r10) +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovlps %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -117,30 +117,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) ; AVX2-FP-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX2-FP-NEXT: vmovlps %xmm3, (%r9) -; AVX2-FP-NEXT: vmovlps %xmm6, (%r11) +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vmovlps %xmm2, (%r9) +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vmovlps %xmm2, (%r11) +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vmovlps %xmm1, (%r10) +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -150,30 +150,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx) +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) ; AVX2-FCP-NEXT: vpextrq $1, %xmm2, (%r8) -; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9) -; AVX2-FCP-NEXT: vmovlps %xmm6, (%r11) +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vmovlps %xmm2, (%r9) +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vmovlps %xmm2, (%r11) +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -186,28 +186,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512-NEXT: vmovaps (%rdi), %ymm4 -; AVX512-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512-NEXT: 
vextractf128 $1, %ymm1, %xmm4 -; AVX512-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vmovq %xmm0, (%rcx) ; AVX512-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512-NEXT: vmovlps %xmm5, (%r9) -; AVX512-NEXT: vmovlps %xmm6, (%r11) -; AVX512-NEXT: vmovlps %xmm4, (%r10) -; AVX512-NEXT: vmovlps %xmm1, (%rax) +; AVX512-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%r9) +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%r11) +; AVX512-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovlps %xmm1, (%r10) +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -219,27 +219,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512-FCP-NEXT: vmovlps %xmm3, 
(%r11) +; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -251,28 +251,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%r11) +; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -284,27 +284,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: 
vmovaps (%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%r11) +; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -316,28 +316,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512BW-NEXT: vmovaps (%rdi), %ymm4 -; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 
= xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512BW-NEXT: vmovlps %xmm5, (%r9) -; AVX512BW-NEXT: vmovlps %xmm6, (%r11) -; AVX512BW-NEXT: vmovlps %xmm4, (%r10) -; AVX512BW-NEXT: vmovlps %xmm1, (%rax) +; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%r9) +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%r11) +; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovlps %xmm1, (%r10) +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -349,27 +349,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%r11) +; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; 
AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -381,28 +381,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r11) +; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -414,27 +414,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = 
ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm6, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%r11) +; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r10) +; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 81fe19c4d8b56..b609299e5f757 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -280,9 +280,9 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: retq ; @@ -290,8 +290,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm1, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovq %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -299,8 +299,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm1, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm0, (%rdx) ; AVX2-NEXT: retq ; @@ -308,8 +308,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FP-NEXT: retq ; @@ -317,8 +317,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX2-FCP-NEXT: retq ; @@ -326,8 +326,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovq %xmm1, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovq %xmm0, (%rdx) ; AVX512-NEXT: retq ; @@ -335,8 +335,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512-FCP-NEXT: retq ; @@ -344,8 +344,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-NEXT: retq ; @@ -353,41 +353,41 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride2_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: 
vpmovwb %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride2_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride2_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride2_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index d1d7cb0a34332..a238371f0acbf 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -378,39 +378,39 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE-NEXT: movq %xmm4, (%rsi) +; SSE-NEXT: movdqa {{.*#+}} xmm4 = 
[0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm5, (%rdx) ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -421,14 +421,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vmovq %xmm2, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmovq %xmm2, (%rdx) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm2, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) ; AVX-NEXT: vmovq %xmm0, (%rcx) ; AVX-NEXT: retq ; @@ -439,14 +439,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; 
AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rdx) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) ; AVX2-NEXT: vmovq %xmm0, (%rcx) ; AVX2-NEXT: retq ; @@ -457,14 +457,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FP-NEXT: retq ; @@ -475,14 +475,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FCP-NEXT: retq ; @@ -493,14 +493,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rdx) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) ; AVX512-NEXT: vmovq %xmm0, (%rcx) ; AVX512-NEXT: retq ; @@ -511,14 +511,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: retq ; @@ -529,14 +529,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-NEXT: retq ; @@ -547,14 +547,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: retq ; @@ -565,14 +565,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-NEXT: retq ; @@ -583,14 +583,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; 
AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: retq ; @@ -601,14 +601,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: retq ; @@ -619,14 +619,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index abef980277ece..1dff9f4b8fa2d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -409,62 +409,62 @@ define void @load_i8_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: 
movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,3,2,3] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movq %xmm3, (%rsi) +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: movq %xmm4, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: movq %xmm6, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm1, (%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride4_vf8: @@ -475,22 +475,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX-NEXT: 
vpshufb %xmm5, %xmm1, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) -; AVX-NEXT: vmovq %xmm4, (%rcx) -; AVX-NEXT: vmovq %xmm1, (%r8) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX-NEXT: vmovq %xmm0, (%rcx) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX-NEXT: vmovq %xmm0, (%r8) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride4_vf8: @@ -501,22 +501,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) -; AVX2-NEXT: vmovq %xmm4, (%rcx) -; AVX2-NEXT: vmovq %xmm1, (%r8) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vmovq %xmm0, (%rcx) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-NEXT: vmovq %xmm0, (%r8) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride4_vf8: @@ -527,22 +527,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm4 = 
[2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-FP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride4_vf8: @@ -553,125 +553,125 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride4_vf8: ; AVX512: 
# %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512-NEXT: vpmovdb %ymm3, (%r8) +; AVX512-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdb %ymm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride4_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride4_vf8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride4_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride4_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512BW-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512BW-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512BW-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512BW-NEXT: vpmovdb %ymm3, (%r8) +; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512BW-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %ymm0, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride4_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BW-FCP-NEXT: 
vpsrld $8, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512BW-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512BW-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride4_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-BW-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-BW-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-BW-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-BW-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride4_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <32 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index ac14f55e3f0ed..5db006e5dadb3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -583,133 +583,133 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = 
[0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movq %xmm8, (%rsi) +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pslld $24, %xmm7 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movq %xmm5, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = 
xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: movq %xmm9, (%rcx) +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm8[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm9 ; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,5,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pslld $24, %xmm8 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn 
%xmm4, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm10, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,0,3,4,5,6,7] +; SSE-NEXT: movq %xmm9, (%r8) +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movq %xmm5, (%rsi) -; SSE-NEXT: movq %xmm2, (%rdx) -; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm10, (%r8) -; SSE-NEXT: movq %xmm7, (%r9) +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movq %xmm5, (%r9) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride5_vf8: @@ -722,30 +722,30 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%r8) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,5,15,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) ; AVX-NEXT: vmovq %xmm0, (%r9) ; AVX-NEXT: retq ; @@ -758,26 +758,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] 
+; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm5, (%rcx) -; AVX2-NEXT: vmovq %xmm6, (%r8) ; AVX2-NEXT: vmovq %xmm0, (%r9) ; AVX2-NEXT: retq ; @@ -790,26 +790,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FP-NEXT: retq ; @@ -822,26 +822,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FCP-NEXT: retq ; @@ -854,26 +854,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 -; 
AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-NEXT: vmovq %xmm3, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%rdx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%rcx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%r8) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm4, (%rdx) -; AVX512-NEXT: vmovq %xmm5, (%rcx) -; AVX512-NEXT: vmovq %xmm6, (%r8) ; AVX512-NEXT: vmovq %xmm0, (%r9) ; AVX512-NEXT: retq ; @@ -886,26 +886,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-FCP-NEXT: vpshufb 
{{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: retq ; @@ -918,26 +918,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-NEXT: retq ; @@ -950,26 +950,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx) +; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-FCP-NEXT: retq ; @@ -982,26 +982,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%r8) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-NEXT: vmovq %xmm5, (%rcx) -; AVX512BW-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-NEXT: retq ; @@ -1014,26 +1014,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-FCP-NEXT: retq ; @@ -1046,26 +1046,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr 
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-NEXT: retq ; @@ -1078,26 +1078,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <40 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index f87126a98eea4..763b8a67edaf7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -755,146 +755,146 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm8 -; 
SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16711935,16711935,16711935,16711935] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [16711935,16711935,16711935,16711935] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movq %xmm9, (%rsi) +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; 
SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movq %xmm7, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: movq %xmm12, (%rcx) 
; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm14, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,1,0,2] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,1,2,0] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,0,2] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm9[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,2] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: movq %xmm10, (%r8) +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,0] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movq %xmm3, (%r9) +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm1, (%rdx) -; SSE-NEXT: movq %xmm10, (%rcx) -; SSE-NEXT: movq %xmm12, (%r8) -; SSE-NEXT: movq %xmm3, (%r9) -; SSE-NEXT: movq %xmm9, (%rax) +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movq %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride6_vf8: @@ -910,42 +910,42 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[4,10,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = 
zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX-NEXT: vmovq %xmm3, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vmovq {{.*#+}} xmm4 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%r8) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%r9) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm7, (%r8) -; AVX-NEXT: vmovq %xmm8, (%r9) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -959,30 +959,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vmovq %xmm4, (%rsi) ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vmovq %xmm2, (%rdx) +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vmovq %xmm4, (%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r8) ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r9) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm4, (%rsi) -; AVX2-NEXT: vmovq %xmm2, (%rdx) -; AVX2-NEXT: vmovq %xmm6, (%rcx) -; AVX2-NEXT: vmovq %xmm3, (%r8) -; 
AVX2-NEXT: vmovq %xmm5, (%r9) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -997,30 +997,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r8) ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r9) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FP-NEXT: vmovq %xmm5, (%r9) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -1035,30 +1035,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = 
zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1073,30 +1073,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-NEXT: vmovq 
%xmm4, (%rsi) ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-NEXT: vmovq %xmm4, (%rcx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r8) ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r9) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm4, (%rsi) -; AVX512-NEXT: vmovq %xmm2, (%rdx) -; AVX512-NEXT: vmovq %xmm6, (%rcx) -; AVX512-NEXT: vmovq %xmm3, (%r8) -; AVX512-NEXT: vmovq %xmm5, (%r9) ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1111,30 +1111,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendw 
{{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -1149,30 +1149,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; 
AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1187,30 +1187,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -1225,30 +1225,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vmovq %xmm4, (%rsi) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 
-; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vmovq %xmm4, (%rcx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r8) ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm4, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm6, (%rcx) -; AVX512BW-NEXT: vmovq %xmm3, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1263,30 +1263,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1301,30 +1301,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -1339,30 +1339,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 82481269022b0..09d00795e4cc9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -932,106 +932,100 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = 
xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm13, %xmm9 ; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm13[0],xmm8[1,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm5, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: movaps %xmm13, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm6[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm6[2,3] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm13 ; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,5,6] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm13 -; SSE-NEXT: por %xmm7, %xmm13 +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm9, %xmm15 +; SSE-NEXT: por %xmm7, %xmm15 +; SSE-NEXT: movq %xmm15, (%rsi) ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm3[8],xmm15[9],xmm3[9],xmm15[10],xmm3[10],xmm15[11],xmm3[11],xmm15[12],xmm3[12],xmm15[13],xmm3[13],xmm15[14],xmm3[14],xmm15[15],xmm3[15] +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm8, %xmm9 ; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm9, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: packuswb %xmm1, 
%xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movq %xmm9, (%rdx) ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 @@ -1040,107 +1034,104 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movq %xmm8, (%rcx) ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movq %xmm4, (%r8) ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] -; 
SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3],xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] ; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%rcx) +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: por %xmm11, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = 
xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] @@ -1148,12 +1139,6 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: movq %xmm13, (%rsi) -; SSE-NEXT: movq %xmm9, (%rdx) -; SSE-NEXT: movq %xmm8, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm10, (%r9) -; SSE-NEXT: movq %xmm11, (%rdi) ; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -1174,52 +1159,52 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0] ; AVX-NEXT: # xmm7 = mem[0,0] ; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3] -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm8, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX-NEXT: vpblendvb %xmm7, %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX-NEXT: vpblendvb %xmm7, %xmm9, %xmm6, %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; 
AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3] +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovq %xmm4, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovq %xmm4, (%rcx) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovq %xmm4, (%r8) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r9) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r10) ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm8, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm7, (%r9) -; AVX-NEXT: vmovq %xmm10, (%r10) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -1235,45 +1220,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovq %xmm2, (%rsi) +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rdx) +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: 
vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rcx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r8) +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r9) +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%r10) +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) -; AVX2-NEXT: vmovq %xmm4, (%rcx) -; AVX2-NEXT: vmovq %xmm5, (%r8) -; AVX2-NEXT: vmovq %xmm6, (%r9) -; AVX2-NEXT: vmovq %xmm7, (%r10) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1290,45 +1275,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FP-NEXT: 
vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; 
AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm5, (%r8) -; AVX2-FP-NEXT: vmovq %xmm6, (%r9) -; AVX2-FP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -1345,45 +1330,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] -; AVX2-FCP-NEXT: 
vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1400,44 +1385,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: 
vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rcx) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; 
AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r8) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r9) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r10) ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) -; AVX512-NEXT: vmovq %xmm5, (%r8) -; AVX512-NEXT: vmovq %xmm6, (%r9) -; AVX512-NEXT: vmovq %xmm7, (%r10) ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1454,44 +1439,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r10) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512-FCP-NEXT: vextracti128 
$1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -1508,44 +1493,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; 
AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r10) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1562,44 +1547,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 
= xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; 
AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r10) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -1617,48 +1602,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: movw $580, %di # imm = 0x244 -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: 
vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: movw $580, %dx # imm = 0x244 +; AVX512BW-NEXT: kmovd %edx, %k1 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r8) +; AVX512BW-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r10) +; AVX512BW-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: 
vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1676,48 +1661,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512BW-FCP-NEXT: movw $580, %di # imm = 0x244 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; 
AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: movw $580, %dx # imm = 0x244 +; AVX512BW-FCP-NEXT: kmovd %edx, %k1 +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512BW-FCP-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX512BW-FCP-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1735,48 +1720,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 
= xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: movw $580, %di # imm = 0x244 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: movw $580, %dx # imm = 0x244 +; AVX512DQ-BW-NEXT: kmovd %edx, %k1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-BW-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-NEXT: vpblendmw 
%ymm1, %ymm0, %ymm2 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r10) +; AVX512DQ-BW-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -1794,48 +1779,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-BW-FCP-NEXT: movw $580, %di # imm = 0x244 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, 
%ymm0, %ymm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: movw $580, %dx # imm = 0x244 +; AVX512DQ-BW-FCP-NEXT: kmovd %edx, %k1 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-BW-FCP-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX512DQ-BW-FCP-NEXT: movw $9288, %cx # imm = 0x2448 
+; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 6770fb6660606..deb74d2b4651f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -878,212 +878,205 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i8_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm9 ; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm13 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%rsi) +; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: movdqa %xmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: packuswb %xmm14, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,1,3] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn 
%xmm8, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[3,3,3,3] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,1,1,3] +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movq %xmm0, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm12, %xmm12 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movq %xmm4, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq %xmm0, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,0,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movq %xmm6, (%r9) +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = 
xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %xmm6, (%rax) +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %xmm1, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,3,2,3] -; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlps %xmm0, (%rdx) -; SSE-NEXT: movq %xmm8, (%rcx) -; SSE-NEXT: movq %xmm15, (%r8) -; SSE-NEXT: movq %xmm11, (%r9) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm9, (%rax) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm4, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq %xmm3, (%rax) -; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride8_vf8: @@ -1104,76 +1097,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX-NEXT: 
vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rdx) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rcx) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r8) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r9) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r11) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r10) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: 
vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm6, (%rcx) -; AVX-NEXT: vmovq %xmm7, (%r8) -; AVX-NEXT: vmovq %xmm8, (%r9) -; AVX-NEXT: vmovq %xmm9, (%r11) -; AVX-NEXT: vmovq %xmm10, (%r10) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -1195,76 +1188,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rsi) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-NEXT: 
vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rdx) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rcx) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r8) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r9) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r11) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r10) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-NEXT: vmovq %xmm4, (%rsi) -; AVX2-NEXT: vmovq %xmm5, (%rdx) -; AVX2-NEXT: vmovq %xmm6, (%rcx) -; AVX2-NEXT: vmovq %xmm7, (%r8) -; AVX2-NEXT: vmovq %xmm8, (%r9) -; AVX2-NEXT: vmovq %xmm9, (%r11) -; AVX2-NEXT: vmovq %xmm10, (%r10) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -1286,76 +1279,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r8) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r9) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r11) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r10) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FP-NEXT: 
vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm7, (%r8) -; AVX2-FP-NEXT: vmovq %xmm8, (%r9) -; AVX2-FP-NEXT: vmovq %xmm9, (%r11) -; AVX2-FP-NEXT: vmovq %xmm10, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -1364,54 +1357,54 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm8 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm9 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm14 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,5,7,5,7,6,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm8, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm11, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r11) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r10) -; AVX2-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm7 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3] +; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx) +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm9 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm10 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3] +; AVX2-FCP-NEXT: vmovq %xmm9, (%rcx) +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm11 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r11) +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -1421,21 +1414,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512-NEXT: vpmovqb %zmm3, (%r8) -; AVX512-NEXT: vpmovqb %zmm4, (%r9) -; AVX512-NEXT: vpmovqb %zmm5, (%r11) -; AVX512-NEXT: vpmovqb %zmm6, (%r10) -; AVX512-NEXT: vpmovqb %zmm7, (%rax) +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r8) +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r9) +; AVX512-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r11) +; AVX512-NEXT: vpsrlq 
$48, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r10) +; AVX512-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1445,21 +1438,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512-FCP-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512-FCP-NEXT: vpmovqb %zmm3, (%r8) -; AVX512-FCP-NEXT: vpmovqb %zmm4, (%r9) -; AVX512-FCP-NEXT: vpmovqb %zmm5, (%r11) -; AVX512-FCP-NEXT: vpmovqb %zmm6, (%r10) -; AVX512-FCP-NEXT: vpmovqb %zmm7, (%rax) +; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r8) +; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r9) +; AVX512-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r11) +; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r10) +; AVX512-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovqb %zmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1469,21 +1462,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512DQ-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512DQ-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512DQ-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512DQ-NEXT: vpmovqb %zmm3, (%r8) -; AVX512DQ-NEXT: vpmovqb %zmm4, (%r9) -; AVX512DQ-NEXT: vpmovqb %zmm5, (%r11) -; AVX512DQ-NEXT: vpmovqb %zmm6, (%r10) -; AVX512DQ-NEXT: vpmovqb %zmm7, (%rax) +; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512DQ-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r8) +; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r9) +; AVX512DQ-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r11) +; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpmovqb %zmm1, (%r10) +; AVX512DQ-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqb %zmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -1493,21 +1486,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpsrlq $32, 
%zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, (%r8) -; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, (%r9) -; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, (%r11) -; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, (%r10) -; AVX512DQ-FCP-NEXT: vpmovqb %zmm7, (%rax) +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r8) +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r9) +; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r11) +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r10) +; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -1517,21 +1510,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512BW-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512BW-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512BW-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512BW-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512BW-NEXT: vpmovqb %zmm3, (%r8) -; AVX512BW-NEXT: vpmovqb %zmm4, (%r9) -; AVX512BW-NEXT: vpmovqb %zmm5, (%r11) -; AVX512BW-NEXT: vpmovqb %zmm6, (%r10) -; AVX512BW-NEXT: vpmovqb %zmm7, (%rax) +; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512BW-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovqb %zmm1, (%r8) +; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovqb %zmm1, (%r9) +; AVX512BW-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovqb %zmm1, (%r11) +; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovqb %zmm1, (%r10) +; AVX512BW-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqb %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1541,21 +1534,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512BW-FCP-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512BW-FCP-NEXT: vpmovqb %zmm3, (%r8) -; AVX512BW-FCP-NEXT: vpmovqb %zmm4, (%r9) -; AVX512BW-FCP-NEXT: vpmovqb %zmm5, (%r11) -; AVX512BW-FCP-NEXT: vpmovqb %zmm6, (%r10) -; AVX512BW-FCP-NEXT: vpmovqb %zmm7, 
(%rax) +; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r8) +; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r9) +; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r11) +; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r10) +; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpmovqb %zmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -1565,21 +1558,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512DQ-BW-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512DQ-BW-NEXT: vpmovqb %zmm3, (%r8) -; AVX512DQ-BW-NEXT: vpmovqb %zmm4, (%r9) -; AVX512DQ-BW-NEXT: vpmovqb %zmm5, (%r11) -; AVX512DQ-BW-NEXT: vpmovqb %zmm6, (%r10) -; AVX512DQ-BW-NEXT: vpmovqb %zmm7, (%rax) +; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r8) +; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r9) +; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r11) +; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r10) +; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpmovqb %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -1589,21 +1582,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm3, (%r8) -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, (%r9) -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm5, (%r11) -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm6, (%r10) -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r9) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovqb 
%zmm1, (%r11) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r10) +; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <64 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index c5d3297e334c7..7c1a531628eab 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1931,31 +1931,28 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE-LABEL: constant_shift_v8i8: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psraw $8, %xmm0 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2] ; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: constant_shift_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v8i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256] +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1977,7 +1974,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -2003,14 +2001,12 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; X86-SSE-LABEL: constant_shift_v8i8: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2] ; X86-SSE-NEXT: psrlw $8, %xmm0 -; X86-SSE-NEXT: 
packuswb %xmm2, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE-NEXT: retl %shift = ashr <8 x i8> %a, ret <8 x i8> %shift @@ -2019,31 +2015,28 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE-LABEL: constant_shift_v4i8: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256] +; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u] ; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: constant_shift_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,u,u,u,u] ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v4i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256] +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -2065,7 +2058,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -2091,14 +2084,12 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; X86-SSE-LABEL: constant_shift_v4i8: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 -; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256] +; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,u,u,u,u] ; X86-SSE-NEXT: psrlw $8, %xmm0 -; X86-SSE-NEXT: packuswb %xmm2, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE-NEXT: retl %shift = ashr <4 x i8> %a, ret <4 x i8> %shift @@ -2107,31 +2098,28 @@ 
define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE-LABEL: constant_shift_v2i8: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256] +; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u] ; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,u,u,u,u,u,u] ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v2i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256] +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -2153,7 +2141,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -2179,14 +2167,12 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; X86-SSE-LABEL: constant_shift_v2i8: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 -; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256] +; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,u,u,u,u,u,u] ; X86-SSE-NEXT: psrlw $8, %xmm0 -; X86-SSE-NEXT: packuswb %xmm2, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE-NEXT: retl %shift = ashr <2 x i8> %a, ret <2 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index eb39b6a0d2227..e6eb4d70d22c9 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ 
b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1617,39 +1617,34 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v8i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2] ; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2] -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2] +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v8i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256] +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1671,7 +1666,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1698,12 +1694,10 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; X86-SSE-LABEL: constant_shift_v8i8: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2] ; X86-SSE-NEXT: psrlw $8, %xmm0 -; X86-SSE-NEXT: packuswb %xmm2, %xmm0 +; X86-SSE-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE-NEXT: retl %shift = lshr <8 x i8> %a, ret <8 x i8> %shift @@ -1713,39 +1707,34 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v4i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256] +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u] ; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,256,256,256,256] -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u] +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,u,u,u,u] ; 
AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v4i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256] +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1767,7 +1756,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1794,12 +1783,10 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; X86-SSE-LABEL: constant_shift_v4i8: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256] +; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,u,u,u,u] ; X86-SSE-NEXT: psrlw $8, %xmm0 -; X86-SSE-NEXT: packuswb %xmm2, %xmm0 +; X86-SSE-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE-NEXT: retl %shift = lshr <4 x i8> %a, ret <4 x i8> %shift @@ -1809,39 +1796,34 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256] +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u] ; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,256,256,256,256,256,256] -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u] +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,u,u,u,u,u,u] ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v2i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256] +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1863,7 +1845,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1890,12 +1872,10 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; X86-SSE-LABEL: constant_shift_v2i8: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256] +; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,u,u,u,u,u,u] ; X86-SSE-NEXT: psrlw $8, %xmm0 -; X86-SSE-NEXT: packuswb %xmm2, %xmm0 +; 
X86-SSE-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE-NEXT: retl %shift = lshr <2 x i8> %a, ret <2 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index d245bdca6ee29..ec7db86e5e05e 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -1478,7 +1478,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1567,7 +1568,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1656,7 +1657,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index ee9d8a55aeb3e..35e1c5a559a95 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3575,21 +3575,17 @@ define void @SpinningCube() { ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1] -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE2-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE2-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u] -; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] -; SSE2-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0] -; SSE2-NEXT: addps %xmm0, %xmm3 -; SSE2-NEXT: movaps %xmm3, (%rax) -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: addps %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, (%rax) +; 
SSE2-NEXT: movd {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSE2-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT: movaps %xmm2, (%rax)
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: addps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: SpinningCube:
@@ -3598,54 +3594,43 @@ define void @SpinningCube() {
; SSSE3-NEXT: xorps %xmm0, %xmm0
; SSSE3-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
-; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSSE3-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSSE3-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u]
-; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
-; SSSE3-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0]
-; SSSE3-NEXT: addps %xmm0, %xmm3
-; SSSE3-NEXT: movaps %xmm3, (%rax)
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
-; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT: addps %xmm2, %xmm0
-; SSSE3-NEXT: movaps %xmm0, (%rax)
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSSE3-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT: movaps %xmm2, (%rax)
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
+; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: addps %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: SpinningCube:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,mem[0]
-; SSE41-NEXT: movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
-; SSE41-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
-; SSE41-NEXT: addps %xmm3, %xmm4
-; SSE41-NEXT: movaps %xmm4, (%rax)
-; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
-; SSE41-NEXT: mulps %xmm1, %xmm2
-; SSE41-NEXT: addps %xmm0, %xmm2
-; SSE41-NEXT: movaps %xmm2, (%rax)
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; SSE41-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
+; SSE41-NEXT: movaps %xmm1, (%rax)
+; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
+; SSE41-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: addps %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm1, (%rax)
; SSE41-NEXT: retq
;
; AVX-LABEL: SpinningCube:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,0.0E+0,1.0E+0]
-; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vmovaps %xmm2, (%rax)
-; AVX-NEXT: vbroadcastss (%rax), %xmm2
-; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vmovaps %xmm1, (%rax)
+; AVX-NEXT: vbroadcastss (%rax), %xmm1
+; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rax)
; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index f70145d6b21c2..0fe3edab4ac38 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -641,10 +641,10 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;