diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ae73a32a5d9ac3..fc19800eda79c5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35436,7 +35436,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                             DL, 256);
     }
 
-    MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
     if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
       return SDValue(); // Nothing to do!
 
@@ -35449,12 +35448,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       unsigned PermMask = 0;
       PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
       PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
-
-      Res = CanonicalizeShuffleInput(ShuffleVT, V1);
-      Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
-                        DAG.getUNDEF(ShuffleVT),
-                        DAG.getTargetConstant(PermMask, DL, MVT::i8));
-      return DAG.getBitcast(RootVT, Res);
+      return DAG.getNode(
+          X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
+          DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
     }
 
     if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
@@ -35470,14 +35466,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
         unsigned PermMask = 0;
         PermMask |= ((BaseMask[0] & 3) << 0);
         PermMask |= ((BaseMask[1] & 3) << 4);
-
         SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
         SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
-        Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT,
-                          CanonicalizeShuffleInput(ShuffleVT, LHS),
-                          CanonicalizeShuffleInput(ShuffleVT, RHS),
+        return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
+                           CanonicalizeShuffleInput(RootVT, LHS),
+                           CanonicalizeShuffleInput(RootVT, RHS),
                            DAG.getTargetConstant(PermMask, DL, MVT::i8));
-        return DAG.getBitcast(RootVT, Res);
       }
     }
   }
@@ -37323,11 +37317,26 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     return SDValue();
   }
   case X86ISD::VPERM2X128: {
+    // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
+    SDValue LHS = N->getOperand(0);
+    SDValue RHS = N->getOperand(1);
+    if (LHS.getOpcode() == ISD::BITCAST &&
+        (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
+      EVT SrcVT = LHS.getOperand(0).getValueType();
+      if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
+        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
+                                              DAG.getBitcast(SrcVT, LHS),
+                                              DAG.getBitcast(SrcVT, RHS),
+                                              N->getOperand(2)));
+      }
+    }
+
+    // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
     if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
-        return Res;
+      return Res;
 
-    // Combine vperm2x128 subvector shuffle with an inner concat pattern.
-    // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
+    // Fold vperm2x128 subvector shuffle with an inner concat pattern.
+    // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
     auto FindSubVector128 = [&](unsigned Idx) {
       if (Idx > 3)
         return SDValue();
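The combineTargetShuffle hunk above is the core of the change: X86ISD::VPERM2X128 moves whole 128-bit lanes and never inspects element boundaries, so bitcasts on its operands can be hoisted through the node, and the VPERM2X128 node itself can keep the shuffle's natural type instead of being forced to the old ShuffleVT of v4f64/v4i64. The following standalone C++ sketch (illustrative only, not LLVM code; all names are made up) models the instruction's immediate encoding as used by the PermMask logic in combineX86ShuffleChain: bits [1:0] of each nibble select one of the four input lanes and bit 3 (the 0x8 in the code above) zeroes the result lane.

#include <cassert>
#include <cstdint>
#include <cstring>

struct V256 { std::uint8_t Bytes[32]; }; // one 256-bit register, raw bytes

// Model of VPERM2X128: each 128-bit half of the result is chosen by one
// nibble of the immediate. Selector 0/1 = low/high lane of Src1,
// 2/3 = low/high lane of Src2; bit 3 forces the half to zero.
static V256 Perm2x128(const V256 &Src1, const V256 &Src2, std::uint8_t Imm) {
  const std::uint8_t *Lanes[4] = {Src1.Bytes, Src1.Bytes + 16, Src2.Bytes,
                                  Src2.Bytes + 16};
  V256 Res;
  for (int Half = 0; Half < 2; ++Half) {
    std::uint8_t Sel = (Imm >> (4 * Half)) & 0xF;
    if (Sel & 0x8)
      std::memset(Res.Bytes + 16 * Half, 0, 16);
    else
      std::memcpy(Res.Bytes + 16 * Half, Lanes[Sel & 0x3], 16);
  }
  return Res;
}

int main() {
  V256 X, Y;
  for (int I = 0; I < 32; ++I) {
    X.Bytes[I] = I;
    Y.Bytes[I] = 100 + I;
  }
  // 0x31 is the "ymm0[2,3],ymm1[2,3]" shuffle seen in the old test output:
  // low half = high lane of src1, high half = high lane of src2.
  V256 R = Perm2x128(X, Y, 0x31);
  assert(R.Bytes[0] == 16 && R.Bytes[16] == 116);
  return 0;
}

Because this routine only copies 16-byte blocks, permuting and then reinterpreting the element type yields the same bytes as reinterpreting and then permuting, which is exactly the equivalence vperm2x128(bitcast(x),bitcast(y),c) == bitcast(vperm2x128(x,y,c)) that the new fold exploits.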
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 071c638077b25b..7cf555748c46f0 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7287,16 +7287,12 @@ let ExeDomain = SSEPackedSingle in {
 let isCommutable = 1 in
 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, u8imm:$src3),
-          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
-                              (i8 timm:$src3))))]>, VEX_4V, VEX_L,
-          Sched<[WriteFShuffle256]>;
+          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+          VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
-          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
-                             (i8 timm:$src3)))]>, VEX_4V, VEX_L,
-          Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
+          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+          VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
 }
 
 // Immediate transform to help with commuting.
@@ -7304,23 +7300,27 @@ def Perm2XCommuteImm : SDNodeXForm<timm, [{
   return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
 }]>;
 
+multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
+  def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
+            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
+  def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
+            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
+  // Pattern with load in other operand.
+  def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
+            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
+                                             (Perm2XCommuteImm timm:$imm))>;
+}
+
 let Predicates = [HasAVX] in {
-// Pattern with load in other operand.
-def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
-                                VR256:$src1, (i8 timm:$imm))),
-          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
+  defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
+  defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
 }
 
 let Predicates = [HasAVX1Only] in {
-def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
-          (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
-def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
-                  (loadv4i64 addr:$src2), (i8 timm:$imm))),
-          (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
-// Pattern with load in other operand.
-def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
-                                VR256:$src1, (i8 timm:$imm))),
-          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
+  defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
+  defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
+  defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
+  defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7689,27 +7689,24 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                              WriteFShuffle256, f256mem>, VEX_W;
 
 //===----------------------------------------------------------------------===//
-// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
+// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
 //
 let isCommutable = 1 in
 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, u8imm:$src3),
-          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
-                              (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
-          VEX_4V, VEX_L;
+          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+          Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
-          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
-                             (i8 timm:$src3)))]>,
+          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
           Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>,
           VEX_4V, VEX_L;
 
-let Predicates = [HasAVX2] in
-def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
-                                VR256:$src1, (i8 timm:$imm))),
-          (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
-
+let Predicates = [HasAVX2] in {
+  defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
+  defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
+  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
+  defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
+}
 //===----------------------------------------------------------------------===//
 // VINSERTI128 - Insert packed integer values
diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll
index 82fd7a2699a514..c022cd4e072b85 100644
--- a/llvm/test/CodeGen/X86/haddsub-2.ll
+++ b/llvm/test/CodeGen/X86/haddsub-2.ll
@@ -587,9 +587,8 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
 ;
 ; AVX2-LABEL: avx2_vphadd_d_test:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
   %vecext = extractelement <8 x i32> %A, i32 0
   %vecext1 = extractelement <8 x i32> %A, i32 1
@@ -743,9 +742,8 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
 ;
 ; AVX2-LABEL: avx2_vphadd_w_test:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
   %vecext = extractelement <16 x i16> %a, i32 0
   %vecext1 = extractelement <16 x i16> %a, i32 1
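The haddsub-2.ll diffs above show the payoff: AVX2's vphaddd/vphaddw work within each 128-bit lane, so the old code had to assemble both cross-lane inputs first (vperm2i128 + vinserti128). The new lowering performs the in-lane horizontal add directly and repairs the lane interleaving afterwards with a single vpermq. The same [0,2,1,3] fix-up appears in the vpackuswb truncation tests further down, since the pack instructions are in-lane as well. A small C++ model (standalone sketch with hypothetical helper names, not LLVM code) of why the fix-up works for vphaddd:

#include <array>
#include <cassert>
#include <cstdint>

using V8 = std::array<int32_t, 8>; // a ymm register holding 8 x i32

// Model of "vphaddd %ymm1, %ymm0, %ymm0" (AT&T): within each 128-bit lane,
// the pairwise sums of A's lane come first, then the pairwise sums of B's.
static V8 HaddLanes(const V8 &A, const V8 &B) {
  V8 R{};
  for (int Lane = 0; Lane < 2; ++Lane) {
    int O = 4 * Lane;
    R[O + 0] = A[O + 0] + A[O + 1];
    R[O + 1] = A[O + 2] + A[O + 3];
    R[O + 2] = B[O + 0] + B[O + 1];
    R[O + 3] = B[O + 2] + B[O + 3];
  }
  return R;
}

// Model of "vpermq ymm0 = ymm0[0,2,1,3]" (imm 0xD8): permute 64-bit chunks,
// i.e. adjacent i32 pairs.
static V8 PermQ0213(const V8 &V) {
  const int Order[4] = {0, 2, 1, 3};
  V8 R{};
  for (int I = 0; I < 4; ++I) {
    R[2 * I + 0] = V[2 * Order[I] + 0];
    R[2 * I + 1] = V[2 * Order[I] + 1];
  }
  return R;
}

int main() {
  V8 A{0, 1, 2, 3, 4, 5, 6, 7}, B{8, 9, 10, 11, 12, 13, 14, 15};
  // What the full-width hadd in the IR expects: all of A's pair sums,
  // followed by all of B's pair sums.
  V8 Ref{1, 5, 9, 13, 17, 21, 25, 29};
  assert(PermQ0213(HaddLanes(A, B)) == Ref);
  return 0;
}

Viewed as four 64-bit chunks, the in-lane hadd result is [a, b, a', b'] (A's low-lane sums, B's low-lane sums, A's high-lane sums, B's high-lane sums); the [0,2,1,3] chunk permute restores [a, a', b, b'].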
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index a4923e04c09037..6107e8c0ca5ffa 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -5138,9 +5138,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %mask) {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vpmovmskb %ymm1, %eax
 ; AVX2-NEXT:    notl %eax
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index e944776cea9579..98335d362f95ca 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -626,16 +626,14 @@ define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v8i32_from_v4i32:
 ; XOP:       # %bb.0: # %entry
 ; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
 ; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
+; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm0, %ymm0, %ymm0
 ; XOP-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
+; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
@@ -1049,16 +1047,14 @@ define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) nounwind {
 ; XOP-LABEL: var_shuffle_v8f32_from_v4f32:
 ; XOP:       # %bb.0: # %entry
 ; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
 ; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
+; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm0, %ymm0, %ymm0
 ; XOP-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
+; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index e6b895692cc0d9..2958f7e96cb5c1 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -717,9 +717,8 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmovmskb %ymm0, %eax
 ; AVX2-NEXT:    cmpl $-1, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index a5458d5264bbca..283939f86a9451 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -700,9 +700,8 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmovmskb %ymm0, %eax
 ; AVX2-NEXT:    testl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index efb45ee36f9fda..a60dbcd7480af4 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -756,9 +756,8 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmovmskb %ymm0, %eax
 ; AVX2-NEXT:    movl %eax, %ecx
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index ac1144818b070c..d8e4c00859707c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -5964,7 +5964,7 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12(<16 x i16> %a, <16 x i16> %b) {
 ; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
 ; AVX2-NEXT:    retq
@@ -5988,7 +5988,7 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12(<16 x i16> %a, <16 x i16> %b) {
 ; XOPAVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
 ; XOPAVX2:       # %bb.0:
 ; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; XOPAVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; XOPAVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
 ; XOPAVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
 ; XOPAVX2-NEXT:    retq
@@ -6146,7 +6146,7 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10(<16 x i16> %a, <16 x i16> %b) {
 ; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
 ; AVX2-NEXT:    retq
@@ -6171,7 +6171,7 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10(<16 x i16> %a, <16 x i16> %b) {
 ; XOPAVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
 ; XOPAVX2:       # %bb.0:
 ; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; XOPAVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; XOPAVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
 ; XOPAVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
 ; XOPAVX2-NEXT:    retq
@@ -6329,7 +6329,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26(<16 x i16> %a, <16 x i16> %b) {
 ; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
 ; AVX2-NEXT:    retq
@@ -6353,7 +6353,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26(<16 x i16> %a, <16 x i16> %b) {
 ; XOPAVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
 ; XOPAVX2:       # %bb.0:
 ; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; XOPAVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
+; XOPAVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
 ; XOPAVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
 ; XOPAVX2-NEXT:    retq
@@ -6407,7 +6407,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28(<16 x i16> %a, <16 x i16> %b) {
 ; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
 ; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
 ; AVX2-NEXT:    retq
@@ -6432,7 +6432,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28(<16 x i16> %a, <16 x i16> %b) {
 ; XOPAVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
 ; XOPAVX2:       # %bb.0:
 ; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; XOPAVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3]
+; XOPAVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; XOPAVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
 ; XOPAVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
 ; XOPAVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 8d5bd48119eb10..6a103f642c76f7 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1306,9 +1306,8 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
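Finally, a sanity check on Perm2XCommuteImm, which the new "load in other operand" patterns in the vperm2x128_lowering multiclass rely on: bit 1 of each immediate nibble selects src1 vs src2, so XORing the immediate with 0x22 exactly compensates for swapping the two sources, allowing a load to be folded into the rm form regardless of which operand it feeds. A standalone check (illustrative C++, not LLVM code):

#include <cassert>

// Which source (0 = src1, 1 = src2), which 128-bit half, and whether the
// result lane is zeroed, as encoded by one nibble of the immediate.
struct LaneSel { unsigned Src, Half, Zero; };

static LaneSel Decode(unsigned Nibble) {
  return {(Nibble >> 1) & 1, Nibble & 1, (Nibble >> 3) & 1};
}

int main() {
  for (unsigned Imm = 0; Imm < 256; ++Imm) {
    for (unsigned Half = 0; Half < 2; ++Half) {
      LaneSel A = Decode((Imm >> (4 * Half)) & 0xF);
      LaneSel B = Decode(((Imm ^ 0x22) >> (4 * Half)) & 0xF);
      // Same half selection and zeroing, opposite source register, so
      // vperm2x128(x, y, imm) == vperm2x128(y, x, imm ^ 0x22).
      assert(A.Half == B.Half && A.Zero == B.Zero && A.Src != B.Src);
    }
  }
  return 0;
}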