[VectorCombine] fold extract-extract-op with different extraction indexes

opcode (extelt V0, Ext0), (extelt V1, Ext1) --> extelt (opcode (splat V0, Ext0), V1), Ext1

The first part of this patch generalizes the cost calculation to accept
different extraction indexes. The second part creates a shuffle+extract
before feeding into the existing code to create a vector op+extract.
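
As an illustration, this is the shape of the rewrite on the @ext1_ext0_fmul test below (a hand-written sketch, not copied from the patch):

Before:
  %e0 = extractelement <4 x float> %x, i32 1
  %e1 = extractelement <4 x float> %x, i32 0
  %r = fmul float %e0, %e1

After:
  %s = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %f = fmul <4 x float> %s, %x
  %r = extractelement <4 x float> %f, i64 0

The more expensive extract becomes the splat shuffle, and the remaining extract moves after the vector op.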

The patch conservatively uses "TargetTransformInfo::SK_PermuteSingleSrc"
rather than "TargetTransformInfo::SK_Broadcast" (which splats specifically
from element 0) because we do not currently have a more general "SK_Splat".
That does not affect any of the current regression tests, but we might
find cost model target specializations where the distinction comes into
play.
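
To make the distinction concrete, SK_Broadcast models a splat that always sources lane 0, while the shuffle created by this patch splats an arbitrary lane (illustrative IR, not taken from the patch):

  ; SK_Broadcast: every lane sources element 0
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> zeroinitializer
  ; This patch: one arbitrary element is translated to the needed lane;
  ; with no general SK_Splat kind, it is costed as SK_PermuteSingleSrc
  %s = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>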

I suspect that this transform can expose some missing x86 horizontal op
codegen, so I'm speculatively adding a debug flag that disables the binop
variant of the transform to allow easier testing.
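
A sketch of how the flag could be exercised from a lit test (these RUN lines are assumed for illustration and are not part of this patch):

  ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefix=SHUF
  ; RUN: opt < %s -vector-combine -disable-binop-extract-shuffle -S -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefix=NOSHUF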

The test changes show that we're sensitive to cost model diffs (as we
should be), which means that patches like D74976 should now have better
coverage.

Differential Revision: https://reviews.llvm.org/D75689
rotateright committed Mar 8, 2020
1 parent b827a95 commit a69158c
Showing 3 changed files with 182 additions and 74 deletions.
119 changes: 93 additions & 26 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -38,19 +38,23 @@ static cl::opt<bool> DisableVectorCombine(
"disable-vector-combine", cl::init(false), cl::Hidden,
cl::desc("Disable all vector combine transforms"));

/// Compare the relative costs of extracts followed by scalar operation vs.
/// vector operation followed by extract:
/// opcode (extelt V0, C), (extelt V1, C) --> extelt (opcode V0, V1), C
/// Unless the vector op is much more expensive than the scalar op, this
/// eliminates an extract.
static cl::opt<bool> DisableBinopExtractShuffle(
"disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
cl::desc("Disable binop extract to shuffle transforms"));


/// Compare the relative costs of 2 extracts followed by scalar operation vs.
/// vector operation(s) followed by extract. Return true if the existing
/// instructions are cheaper than a vector alternative. Otherwise, return false
/// and if one of the extracts should be transformed to a shufflevector, set
/// \p ConvertToShuffle to that extract instruction.
static bool isExtractExtractCheap(Instruction *Ext0, Instruction *Ext1,
unsigned Opcode,
const TargetTransformInfo &TTI) {
const TargetTransformInfo &TTI,
Instruction *&ConvertToShuffle) {
assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
(cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue() ==
cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue()) &&
"Expected same constant extract index");

isa<ConstantInt>(Ext1->getOperand(1)) &&
"Expected constant extract indexes");
Type *ScalarTy = Ext0->getType();
Type *VecTy = Ext0->getOperand(0)->getType();
int ScalarOpCost, VectorOpCost;
@@ -69,31 +73,73 @@ static bool isExtractExtractCheap(Instruction *Ext0, Instruction *Ext1,
CmpInst::makeCmpResultType(VecTy));
}

// Get cost estimate for the extract element. This cost will factor into
// Get cost estimates for the extract elements. These costs will factor into
// both sequences.
unsigned ExtIndex = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
int ExtractCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
VecTy, ExtIndex);
unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();

int Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
VecTy, Ext0Index);
int Extract1Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
VecTy, Ext1Index);

// A more expensive extract will always be replaced by a splat shuffle.
// For example, if Ext0 is more expensive:
// opcode (extelt V0, Ext0), (extelt V1, Ext1) -->
// extelt (opcode (splat V0, Ext0), V1), Ext1
// TODO: Evaluate whether that always results in lowest cost. Alternatively,
// check the cost of creating a broadcast shuffle and shuffling both
// operands to element 0.
int CheapExtractCost = std::min(Extract0Cost, Extract1Cost);

// Extra uses of the extracts mean that we include those costs in the
// vector total because those instructions will not be eliminated.
int OldCost, NewCost;
if (Ext0->getOperand(0) == Ext1->getOperand(0)) {
// Handle a special case. If the 2 operands are identical, adjust the
if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
// Handle a special case. If the 2 extracts are identical, adjust the
// formulas to account for that. The extra use charge allows for either the
// CSE'd pattern or an unoptimized form with identical values:
// opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
: !Ext0->hasOneUse() || !Ext1->hasOneUse();
OldCost = ExtractCost + ScalarOpCost;
NewCost = VectorOpCost + ExtractCost + HasUseTax * ExtractCost;
OldCost = CheapExtractCost + ScalarOpCost;
NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
} else {
// Handle the general case. Each extract is actually a different value:
// opcode (extelt V0, C), (extelt V1, C) --> extelt (opcode V0, V1), C
OldCost = 2 * ExtractCost + ScalarOpCost;
NewCost = VectorOpCost + ExtractCost + !Ext0->hasOneUse() * ExtractCost +
!Ext1->hasOneUse() * ExtractCost;
// opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
NewCost = VectorOpCost + CheapExtractCost +
!Ext0->hasOneUse() * Extract0Cost +
!Ext1->hasOneUse() * Extract1Cost;
}

if (Ext0Index == Ext1Index) {
// If the extract indexes are identical, no shuffle is needed.
ConvertToShuffle = nullptr;
} else {
if (IsBinOp && DisableBinopExtractShuffle)
return true;

// If we are extracting from 2 different indexes, then one operand must be
// shuffled before performing the vector operation. The shuffle mask is
// undefined except for 1 lane that is being translated to the remaining
// extraction lane. Therefore, it is a splat shuffle. Ex:
// ShufMask = { undef, undef, 0, undef }
// TODO: The cost model has an option for a "broadcast" shuffle
// (splat-from-element-0), but no option for a more general splat.
NewCost +=
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);

// The more expensive extract will be replaced by a shuffle. If the extracts
// have the same cost, replace the extract with the higher index.
if (Extract0Cost > Extract1Cost)
ConvertToShuffle = Ext0;
else if (Extract1Cost > Extract0Cost)
ConvertToShuffle = Ext1;
else
ConvertToShuffle = Ext0Index > Ext1Index ? Ext0 : Ext1;
}

// Aggressively form a vector op if the cost is equal because the transform
// may enable further optimization.
// Codegen can reverse this transform (scalarize) if it was not profitable.
@@ -162,12 +208,33 @@ static bool foldExtractExtract(Instruction &I, const TargetTransformInfo &TTI) {
V0->getType() != V1->getType())
return false;

// TODO: Handle C0 != C1 by shuffling 1 of the operands.
if (C0 != C1)
Instruction *ConvertToShuffle;
if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), TTI, ConvertToShuffle))
return false;

if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), TTI))
return false;
if (ConvertToShuffle) {
// The shuffle mask is undefined except for 1 lane that is being translated
// to the cheap extraction lane. Example:
// ShufMask = { 2, undef, undef, undef }
uint64_t SplatIndex = ConvertToShuffle == Ext0 ? C0 : C1;
uint64_t CheapExtIndex = ConvertToShuffle == Ext0 ? C1 : C0;
Type *VecTy = V0->getType();
Type *I32Ty = IntegerType::getInt32Ty(I.getContext());
UndefValue *Undef = UndefValue::get(I32Ty);
SmallVector<Constant *, 32> ShufMask(VecTy->getVectorNumElements(), Undef);
ShufMask[CheapExtIndex] = ConstantInt::get(I32Ty, SplatIndex);
IRBuilder<> Builder(ConvertToShuffle);

// extelt X, C --> extelt (splat X), C'
Value *Shuf = Builder.CreateShuffleVector(ConvertToShuffle->getOperand(0),
UndefValue::get(VecTy),
ConstantVector::get(ShufMask));
Value *NewExt = Builder.CreateExtractElement(Shuf, CheapExtIndex);
if (ConvertToShuffle == Ext0)
Ext0 = cast<Instruction>(NewExt);
else
Ext1 = cast<Instruction>(NewExt);
}

if (Pred != CmpInst::BAD_ICMP_PREDICATE)
foldExtExtCmp(Ext0, Ext1, I, TTI);
97 changes: 63 additions & 34 deletions llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -251,14 +251,18 @@ define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
ret i8 %r
}

; TODO: Different extract indexes requires a shuffle.

define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext0_ext1_add(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
; CHECK-NEXT: [[R:%.*]] = add nuw i8 [[E0]], [[E1]]
; CHECK-NEXT: ret i8 [[R]]
; SSE-LABEL: @ext0_ext1_add(
; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
; SSE-NEXT: [[R:%.*]] = add nuw i8 [[E0]], [[E1]]
; SSE-NEXT: ret i8 [[R]]
;
; AVX-LABEL: @ext0_ext1_add(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP2:%.*]] = add nuw <16 x i8> [[X:%.*]], [[TMP1]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[TMP2]], i32 0
; AVX-NEXT: ret i8 [[TMP3]]
;
%e0 = extractelement <16 x i8> %x, i32 0
%e1 = extractelement <16 x i8> %y, i32 1
@@ -267,11 +271,17 @@ define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
}

define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext5_ext0_add(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
; CHECK-NEXT: [[R:%.*]] = sub nsw i8 [[E0]], [[E1]]
; CHECK-NEXT: ret i8 [[R]]
; SSE-LABEL: @ext5_ext0_add(
; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
; SSE-NEXT: [[R:%.*]] = sub nsw i8 [[E0]], [[E1]]
; SSE-NEXT: ret i8 [[R]]
;
; AVX-LABEL: @ext5_ext0_add(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> undef, <16 x i32> <i32 5, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP2:%.*]] = sub nsw <16 x i8> [[TMP1]], [[Y:%.*]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[TMP2]], i64 0
; AVX-NEXT: ret i8 [[TMP3]]
;
%e0 = extractelement <16 x i8> %x, i32 5
%e1 = extractelement <16 x i8> %y, i32 0
@@ -280,11 +290,17 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
}

define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext6_add(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6
; CHECK-NEXT: [[R:%.*]] = and i8 [[E0]], [[E1]]
; CHECK-NEXT: ret i8 [[R]]
; SSE-LABEL: @ext1_ext6_add(
; SSE-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
; SSE-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6
; SSE-NEXT: [[R:%.*]] = and i8 [[E0]], [[E1]]
; SSE-NEXT: ret i8 [[R]]
;
; AVX-LABEL: @ext1_ext6_add(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> undef, <16 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP2:%.*]] = and <16 x i8> [[X:%.*]], [[TMP1]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <16 x i8> [[TMP2]], i32 1
; AVX-NEXT: ret i8 [[TMP3]]
;
%e0 = extractelement <16 x i8> %x, i32 1
%e1 = extractelement <16 x i8> %y, i32 6
@@ -294,10 +310,10 @@ define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) {

define float @ext1_ext0_fmul(<4 x float> %x) {
; CHECK-LABEL: @ext1_ext0_fmul(
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 1
; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x float> [[X]], i32 0
; CHECK-NEXT: [[R:%.*]] = fmul float [[E0]], [[E1]]
; CHECK-NEXT: ret float [[R]]
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x float> [[TMP1]], [[X]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i64 0
; CHECK-NEXT: ret float [[TMP3]]
;
%e0 = extractelement <4 x float> %x, i32 1
%e1 = extractelement <4 x float> %x, i32 0
@@ -309,9 +325,10 @@ define float @ext0_ext3_fmul_extra_use1(<4 x float> %x) {
; CHECK-LABEL: @ext0_ext3_fmul_extra_use1(
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
; CHECK-NEXT: call void @use_f32(float [[E0]])
; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x float> [[X]], i32 3
; CHECK-NEXT: [[R:%.*]] = fmul nnan float [[E0]], [[E1]]
; CHECK-NEXT: ret float [[R]]
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP2:%.*]] = fmul nnan <4 x float> [[X]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; CHECK-NEXT: ret float [[TMP3]]
;
%e0 = extractelement <4 x float> %x, i32 0
call void @use_f32(float %e0)
@@ -336,11 +353,17 @@ define float @ext0_ext3_fmul_extra_use2(<4 x float> %x) {
}

define float @ext0_ext4_fmul_v8f32(<8 x float> %x) {
; CHECK-LABEL: @ext0_ext4_fmul_v8f32(
; CHECK-NEXT: [[E0:%.*]] = extractelement <8 x float> [[X:%.*]], i32 0
; CHECK-NEXT: [[E1:%.*]] = extractelement <8 x float> [[X]], i32 4
; CHECK-NEXT: [[R:%.*]] = fadd float [[E0]], [[E1]]
; CHECK-NEXT: ret float [[R]]
; SSE-LABEL: @ext0_ext4_fmul_v8f32(
; SSE-NEXT: [[E0:%.*]] = extractelement <8 x float> [[X:%.*]], i32 0
; SSE-NEXT: [[E1:%.*]] = extractelement <8 x float> [[X]], i32 4
; SSE-NEXT: [[R:%.*]] = fadd float [[E0]], [[E1]]
; SSE-NEXT: ret float [[R]]
;
; AVX-LABEL: @ext0_ext4_fmul_v8f32(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[X:%.*]], <8 x float> undef, <8 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[X]], [[TMP1]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
; AVX-NEXT: ret float [[TMP3]]
;
%e0 = extractelement <8 x float> %x, i32 0
%e1 = extractelement <8 x float> %x, i32 4
@@ -349,11 +372,17 @@ define float @ext0_ext4_fmul_v8f32(<8 x float> %x) {
}

define float @ext7_ext4_fmul_v8f32(<8 x float> %x) {
; CHECK-LABEL: @ext7_ext4_fmul_v8f32(
; CHECK-NEXT: [[E0:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
; CHECK-NEXT: [[E1:%.*]] = extractelement <8 x float> [[X]], i32 4
; CHECK-NEXT: [[R:%.*]] = fadd float [[E0]], [[E1]]
; CHECK-NEXT: ret float [[R]]
; SSE-LABEL: @ext7_ext4_fmul_v8f32(
; SSE-NEXT: [[E0:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
; SSE-NEXT: [[E1:%.*]] = extractelement <8 x float> [[X]], i32 4
; SSE-NEXT: [[R:%.*]] = fadd float [[E0]], [[E1]]
; SSE-NEXT: ret float [[R]]
;
; AVX-LABEL: @ext7_ext4_fmul_v8f32(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[X:%.*]], <8 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[TMP1]], [[X]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i64 4
; AVX-NEXT: ret float [[TMP3]]
;
%e0 = extractelement <8 x float> %x, i32 7
%e1 = extractelement <8 x float> %x, i32 4
40 changes: 26 additions & 14 deletions llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
@@ -102,11 +102,17 @@
}

define i1 @cmp01_v2f64(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @cmp01_v2f64(
; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[X0]], [[Y1]]
; CHECK-NEXT: ret i1 [[CMP]]
; SSE-LABEL: @cmp01_v2f64(
; SSE-NEXT: [[X0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
; SSE-NEXT: [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
; SSE-NEXT: [[CMP:%.*]] = fcmp oge double [[X0]], [[Y1]]
; SSE-NEXT: ret i1 [[CMP]]
;
; AVX-LABEL: @cmp01_v2f64(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; AVX-NEXT: [[TMP2:%.*]] = fcmp oge <2 x double> [[X:%.*]], [[TMP1]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; AVX-NEXT: ret i1 [[TMP3]]
;
%x0 = extractelement <2 x double> %x, i32 0
%y1 = extractelement <2 x double> %y, i32 1
@@ -115,11 +121,17 @@ define i1 @cmp01_v2f64(<2 x double> %x, <2 x double> %y) {
}

define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @cmp10_v2f64(
; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; CHECK-NEXT: [[Y0:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
; CHECK-NEXT: [[CMP:%.*]] = fcmp ule double [[X1]], [[Y0]]
; CHECK-NEXT: ret i1 [[CMP]]
; SSE-LABEL: @cmp10_v2f64(
; SSE-NEXT: [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; SSE-NEXT: [[Y0:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
; SSE-NEXT: [[CMP:%.*]] = fcmp ule double [[X1]], [[Y0]]
; SSE-NEXT: ret i1 [[CMP]]
;
; AVX-LABEL: @cmp10_v2f64(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; AVX-NEXT: [[TMP2:%.*]] = fcmp ule <2 x double> [[TMP1]], [[Y:%.*]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
; AVX-NEXT: ret i1 [[TMP3]]
;
%x1 = extractelement <2 x double> %x, i32 1
%y0 = extractelement <2 x double> %y, i32 0
@@ -129,10 +141,10 @@ define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) {

define i1 @cmp12_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @cmp12_v4i32(
; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X1]], [[Y2]]
; CHECK-NEXT: ret i1 [[CMP]]
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
; CHECK-NEXT: ret i1 [[TMP3]]
;
%x1 = extractelement <4 x i32> %x, i32 1
%y2 = extractelement <4 x i32> %y, i32 2