diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7895f883863f1..2f9de876a87fa 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36665,6 +36665,43 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
   return SDValue();
 }
 
+/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
+static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
+                                                      SelectionDAG &DAG,
+                                                      const SDLoc &DL) {
+  assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
+
+  MVT VT = V.getSimpleValueType();
+  SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
+  SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
+  unsigned SrcOpc0 = Src0.getOpcode();
+  unsigned SrcOpc1 = Src1.getOpcode();
+  EVT SrcVT0 = Src0.getValueType();
+  EVT SrcVT1 = Src1.getValueType();
+
+  // TODO: Under what circumstances should we push perm2f128 up when we have one
+  // active src?
+  if (SrcOpc0 != SrcOpc1 || SrcVT0 != SrcVT1)
+    return SDValue();
+
+  switch (SrcOpc0) {
+  case X86ISD::VSHLI:
+  case X86ISD::VSRLI:
+  case X86ISD::VSRAI:
+    if (Src0.getOperand(1) == Src1.getOperand(1)) {
+      SDValue Res = DAG.getNode(
+          X86ISD::VPERM2X128, DL, VT, DAG.getBitcast(VT, Src0.getOperand(0)),
+          DAG.getBitcast(VT, Src1.getOperand(0)), V.getOperand(2));
+      Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res),
+                        Src0.getOperand(1));
+      return DAG.getBitcast(VT, Res);
+    }
+    break;
+  }
+
+  return SDValue();
+}
+
 /// Try to combine x86 target specific shuffles.
 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
@@ -37045,6 +37082,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     return SDValue();
   }
   case X86ISD::VPERM2X128: {
+    if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
+      return Res;
+
     // If both 128-bit values were inserted into high halves of 256-bit values,
     // the shuffle can be reduced to a concatenation of subvectors:
     // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
@@ -37053,6 +37093,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
     SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
     unsigned Imm = N.getConstantOperandVal(2);
+
     if (!(Imm == 0x31 &&
           Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
           Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index bd8b7dd355cca..f35e315bbb0b4 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -107,11 +107,9 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
 ;
 ; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
 ; AVX2-SLOW:       # %bb.0: # %entry
-; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: