141 changes: 110 additions & 31 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -14322,13 +14322,15 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = DAG.getBitcast(VT, InputV);
InputV = ShuffleOffset(InputV);
InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}

assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
InputV = DAG.getBitcast(VT, InputV);

// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
@@ -15445,6 +15447,11 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

if (Subtarget.hasSSE41())
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;

int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

if (NumV2Elements == 0) {
Expand Down Expand Up @@ -15483,6 +15490,13 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}

if (Subtarget.hasSSE2())
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
return ZExt;
}

if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
@@ -15498,10 +15512,6 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;

if (Subtarget.hasSSE41()) {
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;

// Use INSERTPS if we can complete the shuffle efficiently.
if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
@@ -16871,7 +16881,7 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// AVX vector shuffle types.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
SelectionDAG &DAG, bool SimpleOnly) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
@@ -16899,34 +16909,60 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
std::tie(LoV2, HiV2) = SplitVector(V2);

// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
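// Determine which halves (low/high) of V1 and V2 a half-width blend mask
// actually references.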
auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
bool &UseHiV1, bool &UseLoV2,
bool &UseHiV2) {
UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
if (M >= NumElements + SplitNumElements)
UseHiV2 = true;
else
UseLoV2 = true;
V2BlendMask[i] = M - NumElements;
BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
if (M >= SplitNumElements)
UseHiV1 = true;
else
UseLoV1 = true;
}
}
};

auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
if (!SimpleOnly)
return true;

bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);

return !(UseHiV1 || UseHiV2);
};

auto HalfBlend = [&](ArrayRef<int> HalfMask) {
SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
V2BlendMask[i] = M - NumElements;
BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
V1BlendMask[i] = M;
BlendMask[i] = i;
}
}

bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);

// Because the lowering happens after all combining takes place, we need to
// manually combine these blend masks as much as possible so that we create
// a minimal number of high-level vector shuffle nodes.

assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle won't be simple");

// First try just blending the halves of V1 or V2.
if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
return DAG.getUNDEF(SplitVT);
@@ -16937,8 +16973,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,

SDValue V1Blend, V2Blend;
if (UseLoV1 && UseHiV1) {
V1Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
} else {
// We only use half of V1 so map the usage down into the final blend mask.
V1Blend = UseLoV1 ? LoV1 : HiV1;
@@ -16947,8 +16982,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
}
if (UseLoV2 && UseHiV2) {
V2Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
} else {
// We only use half of V2 so map the usage down into the final blend mask.
V2Blend = UseLoV2 ? LoV2 : HiV2;
@@ -16958,6 +16992,10 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
}
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};

if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
return SDValue();

SDValue Lo = HalfBlend(LoMask);
SDValue Hi = HalfBlend(HiMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
@@ -17014,7 +17052,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (Mask[i] >= 0)
LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
/*SimpleOnly*/ false);

// Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
// requires that the decomposed single-input shuffles don't end up here.
@@ -17162,6 +17201,20 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
}

/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
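/// For example (illustrative values, not from this patch): with Size == 8 and
/// LaneSize == 4, Mask == <0,5,2,7,4,1,6,3> becomes
/// InLaneMask == <0,9,2,11,4,13,6,15>; each lane-crossing element keeps its
/// offset within a lane but is redirected to the second operand (index + Size)
/// in the lane of the result element.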
static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
SmallVector<int> &InLaneMask) {
int Size = Mask.size();
InLaneMask.assign(Mask.begin(), Mask.end());
for (int i = 0; i < Size; ++i) {
int &M = InLaneMask[i];
if (M < 0)
continue;
if (((M % Size) / LaneSize) != (i / LaneSize))
M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
}
}

/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
/// source with a lane permutation.
///
@@ -17206,21 +17259,17 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");

SmallVector<int, 32> InLaneMask(Mask);
for (int i = 0; i < Size; ++i) {
int &M = InLaneMask[i];
if (M < 0)
continue;
if (((M % Size) / LaneSize) != (i / LaneSize))
M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
}
SmallVector<int> InLaneMask;
computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);

assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
"In-lane shuffle mask expected");

// If we're not using both lanes in each lane and the inlane mask is not
// repeating, then we're better off splitting.
if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
/*SimpleOnly*/ false);

// Flip the lanes, and shuffle the results which should now be in-lane.
MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
@@ -18355,6 +18404,19 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Broadcast;

if (!Subtarget.hasAVX2()) {
SmallVector<int> InLaneMask;
computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);

if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
/*SimpleOnly*/ true))
return R;
}
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return DAG.getBitcast(MVT::v8f32, ZExt);

// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
@@ -18847,7 +18909,7 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return V;
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}

MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
@@ -19082,6 +19144,14 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;

if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return DAG.getBitcast(MVT::v16f32, ZExt);

// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -19399,7 +19469,7 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Subtarget.hasVBMI())
return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);

return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}

/// High-level routine to lower various 512-bit x86 vector shuffles.
@@ -19444,7 +19514,7 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;

return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}

if (VT == MVT::v32f16) {
@@ -57190,9 +57260,18 @@ X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
EVT OpVT = SETCC0->getOperand(0).getValueType();
if (!VT.isInteger())
return AndOrSETCCFoldKind::None;

if (VT.isVector())
return isOperationLegal(ISD::ABS, OpVT) ? AndOrSETCCFoldKind::ABS
: AndOrSETCCFoldKind::None;
return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
(isOperationLegal(ISD::ABS, OpVT)
? AndOrSETCCFoldKind::ABS
: AndOrSETCCFoldKind::None));

// Don't use `NotAnd`: even though `not` is generally shorter code size than
// `add`, `add` can lower to LEA which can save moves / spills. In any case
// where `NotAnd` applies, `AddAnd` does as well.
// TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
// if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
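// For reference, the setcc-logic.ll change below shows the `AddAnd` lowering:
// (X != -1) && (X != 0) becomes ((X + 1) & -2) != 0, i.e. an `inc` plus a
// `test` against -2.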
return AndOrSETCCFoldKind::AddAnd;
}

40 changes: 40 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1468,6 +1468,46 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
}

// (umax X, (xor X, Pow2))
// -> (or X, Pow2)
// (umin X, (xor X, Pow2))
// -> (and X, ~Pow2)
// (smax X, (xor X, Pos_Pow2))
// -> (or X, Pos_Pow2)
// (smin X, (xor X, Pos_Pow2))
// -> (and X, ~Pos_Pow2)
// (smax X, (xor X, Neg_Pow2))
// -> (and X, ~Neg_Pow2)
// (smin X, (xor X, Neg_Pow2))
// -> (or X, Neg_Pow2)
if ((match(I0, m_c_Xor(m_Specific(I1), m_Value(X))) ||
match(I1, m_c_Xor(m_Specific(I0), m_Value(X)))) &&
isKnownToBeAPowerOfTwo(X, /* OrZero */ true)) {
bool UseOr = IID == Intrinsic::smax || IID == Intrinsic::umax;
bool UseAndN = IID == Intrinsic::smin || IID == Intrinsic::umin;

if (IID == Intrinsic::smax || IID == Intrinsic::smin) {
auto KnownSign = getKnownSign(X, II, DL, &AC, &DT);
if (KnownSign == std::nullopt) {
UseOr = false;
UseAndN = false;
} else if (*KnownSign /* true is Signed. */) {
UseOr ^= true;
UseAndN ^= true;
Type *Ty = I0->getType();
// Negative power of 2 must be IntMin. It's possible to prove that X is
// negative / a power of 2 without actually having its known bits, so just
// get the value by hand.
X = Constant::getIntegerValue(
Ty, APInt::getSignedMinValue(Ty->getScalarSizeInBits()));
}
}
if (UseOr)
return BinaryOperator::CreateOr(I0, X);
else if (UseAndN)
return BinaryOperator::CreateAnd(I0, Builder.CreateNot(X));
}

// If we can eliminate ~A and Y is free to invert:
// max ~A, Y --> ~(min A, ~Y)
//
25 changes: 17 additions & 8 deletions llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -4,14 +4,23 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefixes=CHECK,SKX %s

define <16 x float> @test1(<16 x float> %x, ptr %br, float %y) nounwind {
; CHECK-LABEL: test1:
; CHECK: ## %bb.0:
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
; CHECK-NEXT: vbroadcastss %xmm1, %zmm1
; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,15]
; CHECK-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
; CHECK-NEXT: retq
; KNL-LABEL: test1:
; KNL: ## %bb.0:
; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
; KNL-NEXT: movw $16384, %ax ## imm = 0x4000
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test1:
; SKX: ## %bb.0:
; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
; SKX-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
; SKX-NEXT: movw $16384, %ax ## imm = 0x4000
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; SKX-NEXT: retq
%rrr = load float, ptr %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
411 changes: 411 additions & 0 deletions llvm/test/CodeGen/X86/icmp-pow2-diff.ll


12 changes: 4 additions & 8 deletions llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -305,19 +305,15 @@ define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(ptr %ptr) nounwind uwtable
define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; ALL: # %bb.0:
; ALL-NEXT: vmovups (%rdi), %zmm1
; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
; ALL-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; ALL-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
; ALL-NEXT: vmovdqu64 (%rdi), %zmm0
; ALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
;
; X86-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; X86-AVX512F: # %bb.0:
; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT: vmovups (%eax), %zmm1
; X86-AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; X86-AVX512F-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
; X86-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0
; X86-AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-AVX512F-NEXT: retl
%ptr3 = getelementptr inbounds float, ptr %ptr, i64 3
%ptrC = getelementptr inbounds float, ptr %ptr, i64 12
7 changes: 2 additions & 5 deletions llvm/test/CodeGen/X86/pr43866.ll
@@ -15,12 +15,9 @@ define dso_local void @test() {
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,0],xmm0[1,0]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[0,0],ymm1[6,4],ymm0[4,4]
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/setcc-logic.ll
@@ -456,8 +456,8 @@ define zeroext i1 @ne_neg1_and_ne_zero(i64 %x) nounwind {
; CHECK-LABEL: ne_neg1_and_ne_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: incq %rdi
; CHECK-NEXT: cmpq $2, %rdi
; CHECK-NEXT: setae %al
; CHECK-NEXT: testq $-2, %rdi
; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
%cmp1 = icmp ne i64 %x, -1
%cmp2 = icmp ne i64 %x, 0
242 changes: 242 additions & 0 deletions llvm/test/CodeGen/X86/shuffle-as-shifts.ll
@@ -0,0 +1,242 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-ICX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4


define <4 x i32> @shuf_rot_v4i32_1032(<4 x i32> %x) {
; CHECK-LABEL: shuf_rot_v4i32_1032:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
; CHECK-NEXT: retq
%x1 = add <4 x i32> %x, %x
%r = shufflevector <4 x i32> %x1, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
ret <4 x i32> %r
}

define <8 x i32> @shuf_rot_v8i32_10325476(<8 x i32> %x) {
; CHECK-LABEL: shuf_rot_v8i32_10325476:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; CHECK-NEXT: retq
%x1 = add <8 x i32> %x, %x
%r = shufflevector <8 x i32> %x1, <8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x i32> %r
}

define <16 x i32> @shuf_rot_v16i32_1032547698111013121514(<16 x i32> %x) {
; CHECK-LABEL: shuf_rot_v16i32_1032547698111013121514:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-NEXT: retq
%x1 = add <16 x i32> %x, %x
%r = shufflevector <16 x i32> %x1, <16 x i32> zeroinitializer, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
ret <16 x i32> %r
}

define <8 x i16> @shuf_rot_v8i16_10325476(<8 x i16> %x) {
; CHECK-LABEL: shuf_rot_v8i16_10325476:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vprold $16, %xmm0, %xmm0
; CHECK-NEXT: retq
%x1 = add <8 x i16> %x, %x
%r = shufflevector <8 x i16> %x1, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x i16> %r
}

define <16 x i16> @shuf_rot_v16i16_1032547698111013121514(<16 x i16> %x) {
; CHECK-LABEL: shuf_rot_v16i16_1032547698111013121514:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vprold $16, %ymm0, %ymm0
; CHECK-NEXT: retq
%x1 = add <16 x i16> %x, %x
%r = shufflevector <16 x i16> %x1, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
ret <16 x i16> %r
}

define <32 x i16> @shuf_rot_v32i16_1234056749101181314151217181916212223202527272429303128(<32 x i16> %x) {
; CHECK-LABEL: shuf_rot_v32i16_1234056749101181314151217181916212223202527272429303128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vprolq $48, %zmm0, %zmm0
; CHECK-NEXT: retq
%x1 = add <32 x i16> %x, %x
%r = shufflevector <32 x i16> %x1, <32 x i16> zeroinitializer, <32 x i32> <i32 1,i32 2,i32 3,i32 0,i32 5,i32 6,i32 7,i32 4,i32 9,i32 10,i32 11,i32 8,i32 13,i32 14,i32 15,i32 12,i32 17,i32 18,i32 19,i32 16,i32 21,i32 22,i32 23,i32 20,i32 25,i32 26,i32 27,i32 24,i32 29,i32 30,i32 31,i32 28>
ret <32 x i16> %r
}

define <16 x i8> @shuf_rot_v16i8_2301674510118914151213(<16 x i8> %x) {
; CHECK-LABEL: shuf_rot_v16i8_2301674510118914151213:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vprold $16, %xmm0, %xmm0
; CHECK-NEXT: retq
%x1 = add <16 x i8> %x, %x
%r = shufflevector <16 x i8> %x1, <16 x i8> zeroinitializer, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
ret <16 x i8> %r
}

define <32 x i8> @shuf_rot_v32i8_230167451011891415121318191617222320212627242530312829(<32 x i8> %x) {
; CHECK-LABEL: shuf_rot_v32i8_230167451011891415121318191617222320212627242530312829:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vprold $16, %ymm0, %ymm0
; CHECK-NEXT: retq
%x1 = add <32 x i8> %x, %x
%r = shufflevector <32 x i8> %x1, <32 x i8> zeroinitializer, <32 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13, i32 18, i32 19, i32 16, i32 17, i32 22, i32 23, i32 20, i32 21, i32 26, i32 27, i32 24, i32 25, i32 30, i32 31, i32 28, i32 29>
ret <32 x i8> %r
}

define <64 x i8> @shuf_rot_v64i8_3012745611891015121314191617182320212227242526312829303532333439363738434041424744454651484950555253545956575863606162(<64 x i8> %x) {
; CHECK-LABEL: shuf_rot_v64i8_3012745611891015121314191617182320212227242526312829303532333439363738434041424744454651484950555253545956575863606162:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddb %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vprold $8, %zmm0, %zmm0
; CHECK-NEXT: retq
%x1 = add <64 x i8> %x, %x
%r = shufflevector <64 x i8> %x1, <64 x i8> zeroinitializer, <64 x i32> <i32 3,i32 0,i32 1,i32 2,i32 7,i32 4,i32 5,i32 6,i32 11,i32 8,i32 9,i32 10,i32 15,i32 12,i32 13,i32 14,i32 19,i32 16,i32 17,i32 18,i32 23,i32 20,i32 21,i32 22,i32 27,i32 24,i32 25,i32 26,i32 31,i32 28,i32 29,i32 30,i32 35,i32 32,i32 33,i32 34,i32 39,i32 36,i32 37,i32 38,i32 43,i32 40,i32 41,i32 42,i32 47,i32 44,i32 45,i32 46,i32 51,i32 48,i32 49,i32 50,i32 55,i32 52,i32 53,i32 54,i32 59,i32 56,i32 57,i32 58,i32 63,i32 60,i32 61,i32 62>
ret <64 x i8> %r
}

define <4 x i32> @shuf_shr_v4i32_1U3U(<4 x i32> %x) {
; CHECK-LABEL: shuf_shr_v4i32_1U3U:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: retq
%x1 = add <4 x i32> %x, %x
%r = shufflevector <4 x i32> %x1, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
ret <4 x i32> %r
}

define <8 x i32> @shuf_shr_v8i32_1U3U5U7U(<8 x i32> %x) {
; CHECK-LABEL: shuf_shr_v8i32_1U3U5U7U:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT: retq
%x1 = add <8 x i32> %x, %x
%r = shufflevector <8 x i32> %x1, <8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 undef, i32 3, i32 undef, i32 5, i32 undef, i32 7, i32 undef>
ret <8 x i32> %r
}

define <16 x i32> @shuf_shr_v16i32_U3U5U7U9U11U13U15(<16 x i32> %x) {
; CHECK-LABEL: shuf_shr_v16i32_U3U5U7U9U11U13U15:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: retq
%x1 = add <16 x i32> %x, %x
%r = shufflevector <16 x i32> %x1, <16 x i32> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 3, i32 undef, i32 5, i32 undef, i32 7, i32 undef, i32 9, i32 undef, i32 11, i32 undef, i32 13, i32 undef, i32 15, i32 undef>
ret <16 x i32> %r
}

define <8 x i16> @shuf_shr_v8i16_123U567U(<8 x i16> %x) {
; CHECK-LABEL: shuf_shr_v8i16_123U567U:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpsrlq $16, %xmm0, %xmm0
; CHECK-NEXT: retq
%x1 = add <8 x i16> %x, %x
%r = shufflevector <8 x i16> %x1, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 7, i32 undef>
ret <8 x i16> %r
}

define <32 x i16> @shuf_shr_v32i16_1U3U5U7U9U11U13U15U17U19U21U23U25U27U29U31U(<32 x i16> %x) {
; CHECK-LABEL: shuf_shr_v32i16_1U3U5U7U9U11U13U15U17U19U21U23U25U27U29U31U:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vpsrld $16, %zmm0, %zmm0
; CHECK-NEXT: retq
%x1 = add <32 x i16> %x, %x
%r = shufflevector <32 x i16> %x1, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 undef, i32 3, i32 undef, i32 5, i32 undef, i32 7, i32 undef, i32 9, i32 undef, i32 11, i32 undef, i32 13, i32 undef, i32 15, i32 undef, i32 17, i32 undef, i32 19, i32 undef, i32 21, i32 undef, i32 23, i32 undef, i32 25, i32 undef, i32 27, i32 undef, i32 29, i32 undef, i32 31, i32 undef>
ret <32 x i16> %r
}

define <32 x i8> @shuf_shr_v32i8_1U3U5U7U9U11U13U15U17U19U21U23U25U27U29U31U(<32 x i8> %x) {
; CHECK-LABEL: shuf_shr_v32i8_1U3U5U7U9U11U13U15U17U19U21U23U25U27U29U31U:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
; CHECK-NEXT: retq
%x1 = add <32 x i8> %x, %x
%r = shufflevector <32 x i8> %x1, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 undef, i32 3, i32 undef, i32 5, i32 undef, i32 7, i32 undef, i32 9, i32 undef, i32 11, i32 undef, i32 13, i32 undef, i32 15, i32 undef, i32 17, i32 undef, i32 19, i32 undef, i32 21, i32 undef, i32 23, i32 undef, i32 25, i32 undef, i32 27, i32 undef, i32 29, i32 undef, i32 31, i32 undef>
ret <32 x i8> %r
}

define <4 x i32> @shuf_shl_v4i32_U0U2(<4 x i32> %x) {
; CHECK-LABEL: shuf_shl_v4i32_U0U2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT: retq
%x1 = add <4 x i32> %x, %x
%r = shufflevector <4 x i32> %x1, <4 x i32> zeroinitializer, <4 x i32> <i32 undef, i32 0, i32 undef, i32 2>
ret <4 x i32> %r
}

define <8 x i32> @shuf_shl_v8i32_U0U2U4U6(<8 x i32> %x) {
; CHECK-LABEL: shuf_shl_v8i32_U0U2U4U6:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: retq
%x1 = add <8 x i32> %x, %x
%r = shufflevector <8 x i32> %x1, <8 x i32> zeroinitializer, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 undef, i32 4, i32 undef, i32 6>
ret <8 x i32> %r
}

define <16 x i32> @shuf_shl_v16i32_U0U2U4U6U8U10U12U14(<16 x i32> %x) {
; CHECK-LABEL: shuf_shl_v16i32_U0U2U4U6U8U10U12U14:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
%x1 = add <16 x i32> %x, %x
%r = shufflevector <16 x i32> %x1, <16 x i32> zeroinitializer, <16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 undef, i32 4, i32 undef, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 undef, i32 12, i32 undef, i32 14>
ret <16 x i32> %r
}

define <16 x i16> @shuf_shl_v16i16_U0U2U4U6U8U10U12U14(<16 x i16> %x) {
; CHECK-LABEL: shuf_shl_v16i16_U0U2U4U6U8U10U12U14:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vpslld $16, %ymm0, %ymm0
; CHECK-NEXT: retq
%x1 = add <16 x i16> %x, %x
%r = shufflevector <16 x i16> %x1, <16 x i16> zeroinitializer, <16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 undef, i32 4, i32 undef, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 undef, i32 12, i32 undef, i32 14>
ret <16 x i16> %r
}

define <16 x i8> @shuf_shl_v16i8_U0U2U4U6U8U10U12U14(<16 x i8> %x) {
; CHECK-LABEL: shuf_shl_v16i8_U0U2U4U6U8U10U12U14:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0
; CHECK-NEXT: retq
%x1 = add <16 x i8> %x, %x
%r = shufflevector <16 x i8> %x1, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 undef, i32 4, i32 undef, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 undef, i32 12, i32 undef, i32 14>
ret <16 x i8> %r
}

define <64 x i8> @shuf_shl_v64i8_U0U2U4U6U8U10U12U14U16U18U20U22U24U26U28U30U32U34U36U38U40U42U44U46U48U50U52U54U56U58U60U62(<64 x i8> %x) {
; CHECK-LABEL: shuf_shl_v64i8_U0U2U4U6U8U10U12U14U16U18U20U22U24U26U28U30U32U34U36U38U40U42U44U46U48U50U52U54U56U58U60U62:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddb %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vpsllw $8, %zmm0, %zmm0
; CHECK-NEXT: retq
%x1 = add <64 x i8> %x, %x
%r = shufflevector <64 x i8> %x1, <64 x i8> zeroinitializer, <64 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 undef, i32 4, i32 undef, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 undef, i32 12, i32 undef, i32 14, i32 undef, i32 16, i32 undef, i32 18, i32 undef, i32 20, i32 undef, i32 22, i32 undef, i32 24, i32 undef, i32 26, i32 undef, i32 28, i32 undef, i32 30, i32 undef, i32 32, i32 undef, i32 34, i32 undef, i32 36, i32 undef, i32 38, i32 undef, i32 40, i32 undef, i32 42, i32 undef, i32 44, i32 undef, i32 46, i32 undef, i32 48, i32 undef, i32 50, i32 undef, i32 52, i32 undef, i32 54, i32 undef, i32 56, i32 undef, i32 58, i32 undef, i32 60, i32 undef, i32 62>
ret <64 x i8> %r
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK-ICX: {{.*}}
; CHECK-SKX: {{.*}}
; CHECK-V4: {{.*}}
; CHECK-ZNVER4: {{.*}}
27 changes: 19 additions & 8 deletions llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -491,16 +491,27 @@ define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_091b2d3f:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_091b2d3f:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
; AVX512VL-SLOW-LABEL: shuffle_v8f32_091b2d3f:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_091b2d3f:
; AVX512VL-FAST-ALL: # %bb.0:
; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-ALL-NEXT: retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_091b2d3f:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
ret <8 x float> %shuffle
}
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2820,17 +2820,17 @@ define <4 x float> @PR30264(<4 x float> %x) {
; SSE2-LABEL: PR30264:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR30264:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR30264:
51 changes: 20 additions & 31 deletions llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -4677,20 +4677,16 @@ define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, (%rcx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2:
@@ -7005,26 +7001,19 @@ define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, (%rcx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2:
186 changes: 186 additions & 0 deletions llvm/test/Transforms/InstCombine/minmax-of-xor-x.ll
@@ -0,0 +1,186 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

declare i8 @llvm.smin.i8(i8, i8)
declare i8 @llvm.umin.i8(i8, i8)
declare i8 @llvm.smax.i8(i8, i8)
declare i8 @llvm.umax.i8(i8, i8)
declare <2 x i8> @llvm.smin.v2i8(<2 x i8>, <2 x i8>)
declare <2 x i8> @llvm.umin.v2i8(<2 x i8>, <2 x i8>)
declare <2 x i8> @llvm.smax.v2i8(<2 x i8>, <2 x i8>)
declare <2 x i8> @llvm.umax.v2i8(<2 x i8>, <2 x i8>)

declare void @llvm.assume(i1)
declare void @barrier()

define <2 x i8> @umax_xor_Cpow2(<2 x i8> %x) {
; CHECK-LABEL: @umax_xor_Cpow2(
; CHECK-NEXT: [[R:%.*]] = or <2 x i8> [[X:%.*]], <i8 -128, i8 -128>
; CHECK-NEXT: ret <2 x i8> [[R]]
;
%x_xor = xor <2 x i8> %x, <i8 128, i8 128>
%r = call <2 x i8> @llvm.umax.v2i8(<2 x i8> %x, <2 x i8> %x_xor)
ret <2 x i8> %r
}

define i8 @umin_xor_Cpow2(i8 %x) {
; CHECK-LABEL: @umin_xor_Cpow2(
; CHECK-NEXT: [[R:%.*]] = and i8 [[X:%.*]], -65
; CHECK-NEXT: ret i8 [[R]]
;
%x_xor = xor i8 %x, 64
%r = call i8 @llvm.umin.i8(i8 %x, i8 %x_xor)
ret i8 %r
}

define i8 @smax_xor_Cpow2_pos(i8 %x) {
; CHECK-LABEL: @smax_xor_Cpow2_pos(
; CHECK-NEXT: [[R:%.*]] = or i8 [[X:%.*]], 32
; CHECK-NEXT: ret i8 [[R]]
;
%x_xor = xor i8 %x, 32
%r = call i8 @llvm.smax.i8(i8 %x, i8 %x_xor)
ret i8 %r
}

define <2 x i8> @smin_xor_Cpow2_pos(<2 x i8> %x) {
; CHECK-LABEL: @smin_xor_Cpow2_pos(
; CHECK-NEXT: [[R:%.*]] = and <2 x i8> [[X:%.*]], <i8 -17, i8 -17>
; CHECK-NEXT: ret <2 x i8> [[R]]
;
%x_xor = xor <2 x i8> %x, <i8 16, i8 16>
%r = call <2 x i8> @llvm.smin.v2i8(<2 x i8> %x, <2 x i8> %x_xor)
ret <2 x i8> %r
}

define <2 x i8> @smax_xor_Cpow2_neg(<2 x i8> %x) {
; CHECK-LABEL: @smax_xor_Cpow2_neg(
; CHECK-NEXT: [[R:%.*]] = and <2 x i8> [[X:%.*]], <i8 127, i8 127>
; CHECK-NEXT: ret <2 x i8> [[R]]
;
%x_xor = xor <2 x i8> %x, <i8 128, i8 128>
%r = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %x, <2 x i8> %x_xor)
ret <2 x i8> %r
}

define i8 @smin_xor_Cpow2_neg(i8 %x) {
; CHECK-LABEL: @smin_xor_Cpow2_neg(
; CHECK-NEXT: [[R:%.*]] = or i8 [[X:%.*]], -128
; CHECK-NEXT: ret i8 [[R]]
;
%x_xor = xor i8 %x, 128
%r = call i8 @llvm.smin.i8(i8 %x, i8 %x_xor)
ret i8 %r
}

define i8 @umax_xor_pow2(i8 %x, i8 %y) {
; CHECK-LABEL: @umax_xor_pow2(
; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]]
; CHECK-NEXT: [[YP2:%.*]] = and i8 [[NY]], [[Y]]
; CHECK-NEXT: [[R:%.*]] = or i8 [[YP2]], [[X:%.*]]
; CHECK-NEXT: ret i8 [[R]]
;
%ny = sub i8 0, %y
%yp2 = and i8 %y, %ny
%x_xor = xor i8 %x, %yp2
%r = call i8 @llvm.umax.i8(i8 %x, i8 %x_xor)
ret i8 %r
}

define <2 x i8> @umin_xor_pow2(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @umin_xor_pow2(
; CHECK-NEXT: [[NY:%.*]] = sub <2 x i8> zeroinitializer, [[Y:%.*]]
; CHECK-NEXT: [[YP2:%.*]] = and <2 x i8> [[NY]], [[Y]]
; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[YP2]], <i8 -1, i8 -1>
; CHECK-NEXT: [[R:%.*]] = and <2 x i8> [[TMP1]], [[X:%.*]]
; CHECK-NEXT: ret <2 x i8> [[R]]
;
%ny = sub <2 x i8> <i8 0, i8 0>, %y
%yp2 = and <2 x i8> %y, %ny
%x_xor = xor <2 x i8> %x, %yp2
%r = call <2 x i8> @llvm.umin.v2i8(<2 x i8> %x, <2 x i8> %x_xor)
ret <2 x i8> %r
}

define i8 @smax_xor_pow2_unk(i8 %x, i8 %y) {
; CHECK-LABEL: @smax_xor_pow2_unk(
; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]]
; CHECK-NEXT: [[YP2:%.*]] = and i8 [[NY]], [[Y]]
; CHECK-NEXT: [[X_XOR:%.*]] = xor i8 [[YP2]], [[X:%.*]]
; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 [[X_XOR]])
; CHECK-NEXT: ret i8 [[R]]
;
%ny = sub i8 0, %y
%yp2 = and i8 %y, %ny
%x_xor = xor i8 %x, %yp2
%r = call i8 @llvm.smax.i8(i8 %x, i8 %x_xor)
ret i8 %r
}

define <2 x i8> @smin_xor_pow2_unk(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @smin_xor_pow2_unk(
; CHECK-NEXT: [[NY:%.*]] = sub <2 x i8> zeroinitializer, [[Y:%.*]]
; CHECK-NEXT: [[YP2:%.*]] = and <2 x i8> [[NY]], [[Y]]
; CHECK-NEXT: [[X_XOR:%.*]] = xor <2 x i8> [[YP2]], [[X:%.*]]
; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smin.v2i8(<2 x i8> [[X]], <2 x i8> [[X_XOR]])
; CHECK-NEXT: ret <2 x i8> [[R]]
;
%ny = sub <2 x i8> <i8 0, i8 0>, %y
%yp2 = and <2 x i8> %y, %ny
%x_xor = xor <2 x i8> %x, %yp2
%r = call <2 x i8> @llvm.smin.v2i8(<2 x i8> %x, <2 x i8> %x_xor)
ret <2 x i8> %r
}

define i8 @smax_xor_pow2_neg(i8 %x, i8 %y) {
; CHECK-LABEL: @smax_xor_pow2_neg(
; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]]
; CHECK-NEXT: [[YP2:%.*]] = and i8 [[NY]], [[Y]]
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[YP2]], 0
; CHECK-NEXT: br i1 [[CMP]], label [[NEG:%.*]], label [[POS:%.*]]
; CHECK: neg:
; CHECK-NEXT: [[R:%.*]] = and i8 [[X:%.*]], 127
; CHECK-NEXT: ret i8 [[R]]
; CHECK: pos:
; CHECK-NEXT: call void @barrier()
; CHECK-NEXT: ret i8 0
;
%ny = sub i8 0, %y
%yp2 = and i8 %y, %ny
%cmp = icmp slt i8 %yp2, 0
br i1 %cmp, label %neg, label %pos
neg:
%x_xor = xor i8 %x, %yp2
%r = call i8 @llvm.smax.i8(i8 %x, i8 %x_xor)
ret i8 %r
pos:
call void @barrier()
ret i8 0
}

define i8 @smin_xor_pow2_pos(i8 %x, i8 %y) {
; CHECK-LABEL: @smin_xor_pow2_pos(
; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]]
; CHECK-NEXT: [[YP2:%.*]] = and i8 [[NY]], [[Y]]
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[YP2]], 0
; CHECK-NEXT: br i1 [[CMP]], label [[NEG:%.*]], label [[POS:%.*]]
; CHECK: neg:
; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[YP2]], -1
; CHECK-NEXT: [[R:%.*]] = and i8 [[TMP1]], [[X:%.*]]
; CHECK-NEXT: ret i8 [[R]]
; CHECK: pos:
; CHECK-NEXT: call void @barrier()
; CHECK-NEXT: ret i8 0
;
%ny = sub i8 0, %y
%yp2 = and i8 %y, %ny
%cmp = icmp sgt i8 %yp2, 0
br i1 %cmp, label %neg, label %pos
neg:
%x_xor = xor i8 %x, %yp2
%r = call i8 @llvm.smin.i8(i8 %x, i8 %x_xor)
ret i8 %r
pos:
call void @barrier()
ret i8 0
}