Skip to content

Commit

Permalink
Revert "[SLP]Improve shuffles cost estimation where possible."
Browse files Browse the repository at this point in the history
This reverts commit 9980c99.

Caused assertion failures: https://reviews.llvm.org/D115462#3555350
  • Loading branch information
MaskRay committed Jun 3, 2022
1 parent 72de758 commit df0f30d
Show file tree
Hide file tree
Showing 11 changed files with 301 additions and 282 deletions.
101 changes: 33 additions & 68 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Expand Up @@ -5557,17 +5557,17 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
for (auto *V : VL) {
++Idx;

// Need to exclude undefs from analysis.
if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
continue;

// Reached the start of a new vector registers.
if (Idx % EltsPerVector == 0) {
RegMask.assign(EltsPerVector, UndefMaskElem);
AllConsecutive = true;
continue;
}

// Need to exclude undefs from analysis.
if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
continue;

// Check all extracts for a vector register on the target directly
// extract values in order.
unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
Expand Down Expand Up @@ -6012,42 +6012,23 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
assert(E->ReuseShuffleIndices.empty() &&
"Unique insertelements only are expected.");
auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());

unsigned const NumElts = SrcVecTy->getNumElements();
unsigned const NumScalars = VL.size();

unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);

unsigned OffsetBeg = *getInsertIndex(VL.front());
unsigned OffsetEnd = OffsetBeg;
for (Value *V : VL.drop_front()) {
unsigned Idx = *getInsertIndex(V);
if (OffsetBeg > Idx)
OffsetBeg = Idx;
else if (OffsetEnd < Idx)
OffsetEnd = Idx;
}
unsigned VecSz = NumElts;
unsigned VecScalarsSz = NumScalars;
if (NumOfParts > 0) {
VecScalarsSz = (NumElts + NumOfParts - 1) / NumOfParts;
VecSz = PowerOf2Ceil(
(1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
VecScalarsSz);
}

APInt DemandedElts = APInt::getZero(NumElts);
// TODO: Add support for Instruction::InsertValue.
SmallVector<int> Mask;
if (!E->ReorderIndices.empty()) {
inversePermutation(E->ReorderIndices, Mask);
Mask.append(NumElts - NumScalars, UndefMaskElem);
} else {
Mask.assign(VecSz, UndefMaskElem);
std::iota(Mask.begin(), std::next(Mask.begin(), VecSz), 0);
Mask.assign(NumElts, UndefMaskElem);
std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
}
unsigned Offset = *getInsertIndex(VL0);
bool IsIdentity = true;
SmallVector<int> PrevMask(VecSz, UndefMaskElem);
SmallVector<int> PrevMask(NumElts, UndefMaskElem);
Mask.swap(PrevMask);
unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
for (unsigned I = 0; I < NumScalars; ++I) {
unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
DemandedElts.setBit(InsertIdx);
Expand All @@ -6060,45 +6041,32 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
/*Insert*/ true, /*Extract*/ false);

// First cost - resize to actual vector size if not identity shuffle or
// need to shift the vector.
// Do not calculate the cost if the actual size is the register size and
// we can merge this shuffle with the following SK_Select.
auto *ActualVecTy =
FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
if ((!IsIdentity || Offset != OffsetBeg) && VecScalarsSz != VecSz)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
ActualVecTy, Mask);
auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
}));
// Second cost - permutation with subvector, if some elements are from the
// initial vector or inserting a subvector.
// TODO: Implement the analysis of the FirstInsert->getOperand(0)
// subvector of ActualVecTy.
if (!isUndefVector(FirstInsert->getOperand(0)) &&
(Offset != OffsetBeg || (OffsetEnd + 1) % VecScalarsSz != 0)) {
unsigned InsertVecSz = PowerOf2Ceil(OffsetEnd - OffsetBeg + 1);
if (InsertVecSz != VecSz) {
Cost += TTI->getShuffleCost(
TTI::SK_InsertSubvector,
(InsertVecSz < VecScalarsSz && NumOfParts > 0)
? FixedVectorType::get(SrcVecTy->getElementType(),
VecScalarsSz)
: ActualVecTy,
None, OffsetBeg - Offset,
FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz));
if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
// FIXME: Replace with SK_InsertSubvector once it is properly supported.
unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
Cost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc,
FixedVectorType::get(SrcVecTy->getElementType(), Sz));
} else if (!IsIdentity) {
auto *FirstInsert =
cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
return !is_contained(E->Scalars,
cast<Instruction>(V)->getOperand(0));
}));
if (isUndefVector(FirstInsert->getOperand(0))) {
Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
} else {
for (unsigned I = 0; I < OffsetBeg; ++I)
Mask[I] = I;
for (unsigned I = OffsetBeg; I <= OffsetEnd; ++I)
SmallVector<int> InsertMask(NumElts);
std::iota(InsertMask.begin(), InsertMask.end(), 0);
for (unsigned I = 0; I < NumElts; I++) {
if (Mask[I] != UndefMaskElem)
Mask[I] = I + VecSz;
for (unsigned I = OffsetEnd + 1; I < VecSz; ++I)
Mask[I] = I;
Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, ActualVecTy, Mask);
InsertMask[Offset + I] = NumElts + I;
}
Cost +=
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
}
}

return Cost;
}
case Instruction::ZExt:
Expand Down Expand Up @@ -6551,10 +6519,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// No need to vectorize inserts of gathered values.
if (VectorizableTree.size() == 2 &&
isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
VectorizableTree[1]->State == TreeEntry::NeedToGather &&
(VectorizableTree[1]->getVectorFactor() <= 2 ||
!(isSplat(VectorizableTree[1]->Scalars) ||
allConstant(VectorizableTree[1]->Scalars))))
VectorizableTree[1]->State == TreeEntry::NeedToGather)
return true;

// We can vectorize the tree if its size is greater than or equal to the
Expand Down
Expand Up @@ -22,16 +22,16 @@ define <4 x float> @int_sin_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
Expand Down Expand Up @@ -218,16 +218,16 @@ define <4 x float> @exp_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
Expand Down Expand Up @@ -301,16 +301,16 @@ define <4 x float> @log_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
Expand Down Expand Up @@ -477,16 +477,16 @@ define <4 x float> @sin_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
Expand Down Expand Up @@ -519,16 +519,16 @@ define <4 x float> @cos_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
Expand Down Expand Up @@ -1010,16 +1010,16 @@ define <4 x float> @int_cos_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
Expand Down

0 comments on commit df0f30d

Please sign in to comment.