232 changes: 150 additions & 82 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -536,24 +536,26 @@ static bool isValidForAlternation(unsigned Opcode) {
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
const TargetLibraryInfo &TLI,
unsigned BaseIndex = 0);

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
Value *Op1) {
Value *Op1, const TargetLibraryInfo &TLI) {
return (isConstant(BaseOp0) && isConstant(Op0)) ||
(isConstant(BaseOp1) && isConstant(Op1)) ||
(!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
!isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
getSameOpcode({BaseOp0, Op0}).getOpcode() ||
getSameOpcode({BaseOp1, Op1}).getOpcode();
getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
}

/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI) {
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
const TargetLibraryInfo &TLI) {
assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
"Assessing comparisons of different types?");
CmpInst::Predicate BasePred = BaseCI->getPredicate();
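areCompatibleCmpOps accepts a pair of cmp instructions when, for either operand position, the two values are both constants, all four values are non-instructions, or the two instructions can be grouped by getSameOpcode — which is why the helper now needs TLI. A minimal IR sketch (hypothetical values, not from this patch):

; The second operands are both constants, which alone satisfies the check,
; so the two fcmps can share a bundle:
%c0 = fcmp olt double %x, 1.0
%c1 = fcmp olt double %y, 2.0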
@@ -566,15 +568,16 @@ static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI) {
Value *Op1 = CI->getOperand(1);

return (BasePred == Pred &&
areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1)) ||
areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
(BasePred == SwappedPred &&
areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0));
areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}

/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
const TargetLibraryInfo &TLI,
unsigned BaseIndex) {
// Make sure these are all Instructions.
if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
@@ -592,9 +595,19 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,

// Check for one alternate opcode from another BinaryOperator.
// TODO - generalize to support all operators (types, calls etc.).
auto *IBase = cast<Instruction>(VL[BaseIndex]);
Intrinsic::ID BaseID = 0;
SmallVector<VFInfo> BaseMappings;
if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
auto *I = cast<Instruction>(VL[Cnt]);
unsigned InstOpcode = I->getOpcode();
if (IsBinOp && isa<BinaryOperator>(I)) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
@@ -603,9 +616,11 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
AltIndex = Cnt;
continue;
}
} else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
} else if (IsCastOp && isa<CastInst>(I)) {
Value *Op0 = IBase->getOperand(0);
Type *Ty0 = Op0->getType();
Value *Op1 = I->getOperand(0);
Type *Ty1 = Op1->getType();
if (Ty0 == Ty1) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
@@ -634,11 +649,11 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
(BasePred == CurrentPred || BasePred == SwappedCurrentPred))
continue;

if (isCmpSameOrSwapped(BaseInst, Inst))
if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
continue;
auto *AltInst = cast<CmpInst>(VL[AltIndex]);
if (AltIndex != BaseIndex) {
if (isCmpSameOrSwapped(AltInst, Inst))
if (isCmpSameOrSwapped(AltInst, Inst, TLI))
continue;
} else if (BasePred != CurrentPred) {
assert(
@@ -652,8 +667,45 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
AltPred == CurrentPred || AltPred == SwappedCurrentPred)
continue;
}
} else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
} else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
if (Gep->getNumOperands() != 2 ||
Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
} else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
if (!isVectorLikeInstWithConstOps(EI))
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
} else if (auto *LI = dyn_cast<LoadInst>(I)) {
auto *BaseLI = cast<LoadInst>(IBase);
if (!LI->isSimple() || !BaseLI->isSimple())
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
} else if (auto *Call = dyn_cast<CallInst>(I)) {
auto *CallBase = cast<CallInst>(IBase);
if (Call->getCalledFunction() != CallBase->getCalledFunction())
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
if (Call->hasOperandBundles() &&
!std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
Call->op_begin() + Call->getBundleOperandsEndIndex(),
CallBase->op_begin() +
CallBase->getBundleOperandsStartIndex()))
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
if (ID != BaseID)
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
if (!ID) {
SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
if (Mappings.size() != BaseMappings.size() ||
Mappings.front().ISA != BaseMappings.front().ISA ||
Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
Mappings.front().VectorName != BaseMappings.front().VectorName ||
Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
Mappings.front().Shape.Parameters !=
BaseMappings.front().Shape.Parameters)
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
}
continue;
}
return InstructionsState(VL[BaseIndex], nullptr, nullptr);
}
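The new early checks mean a bundle of calls only keeps a valid InstructionsState when every call resolves to the same called function and the same vectorizable intrinsic or VFDatabase mapping. A hedged IR sketch (hypothetical, not part of this patch):

; Accepted: both calls carry the same trivially vectorizable intrinsic ID.
%s0 = call fast double @llvm.sin.f64(double %a)
%s1 = call fast double @llvm.sin.f64(double %b)
; Rejected: different callees with no shared intrinsic ID or vector mapping,
; so getSameOpcode returns an InstructionsState with null main/alternate ops.
%t0 = call double @llvm.sin.f64(double %a)
%t1 = call double @bar(double %b)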

@@ -1097,16 +1149,19 @@ class BoUpSLP {

/// A helper class used for scoring candidates for two consecutive lanes.
class LookAheadHeuristics {
const TargetLibraryInfo &TLI;
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
int NumLanes; // Total number of lanes (aka vectorization factor).
int MaxLevel; // The maximum recursion depth for accumulating score.

public:
LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE,
const BoUpSLP &R, int NumLanes, int MaxLevel)
: DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {}
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
int MaxLevel)
: TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
MaxLevel(MaxLevel) {}

// The hard-coded scores listed here are not very important, though it shall
// be higher for better matches to improve the resulting cost. When
@@ -1128,6 +1183,8 @@ class BoUpSLP {
static const int ScoreSplatLoads = 3;
/// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreReversedLoads = 3;
/// A load candidate for masked gather.
static const int ScoreMaskedGatherCandidate = 1;
/// ExtractElementInst from same vector and consecutive indexes.
static const int ScoreConsecutiveExtracts = 4;
/// ExtractElementInst from same vector and reversed indices.
@@ -1182,18 +1239,26 @@ class BoUpSLP {
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2) {
if (LI1->getParent() != LI2->getParent())
if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
!LI2->isSimple())
return LookAheadHeuristics::ScoreFail;

Optional<int> Dist = getPointersDiff(
LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
if (!Dist || *Dist == 0)
if (!Dist || *Dist == 0) {
if (getUnderlyingObject(LI1->getPointerOperand()) ==
getUnderlyingObject(LI2->getPointerOperand()) &&
R.TTI->isLegalMaskedGather(
FixedVectorType::get(LI1->getType(), NumLanes),
LI1->getAlign()))
return LookAheadHeuristics::ScoreMaskedGatherCandidate;
return LookAheadHeuristics::ScoreFail;
}
// The distance is too large - still may be profitable to use masked
// loads/gathers.
if (std::abs(*Dist) > NumLanes / 2)
return LookAheadHeuristics::ScoreAltOpcodes;
return LookAheadHeuristics::ScoreMaskedGatherCandidate;
// This still will detect consecutive loads, but we might have "holes"
// in some cases. It is ok for non-power-2 vectorization and may produce
// better results. It should not affect current vectorization.
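ScoreMaskedGatherCandidate (1) slots between a failing match and the real load patterns (splat/reversed loads score 3), so far-apart or same-address loads stay in the running as gather candidates instead of being dropped or misreported as alternate opcodes. A sketch (hypothetical pointers):

; Same underlying object, 5 elements apart: with NumLanes = 4 this pair now
; scores ScoreMaskedGatherCandidate instead of ScoreAltOpcodes. A pair at an
; unknown or zero distance gets the same score only when the target reports
; masked gathers as legal for the widened type.
%p0 = getelementptr inbounds double, double* %base, i64 0
%p5 = getelementptr inbounds double, double* %base, i64 5
%x0 = load double, double* %p0, align 8
%x5 = load double, double* %p5, align 8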
@@ -1250,7 +1315,7 @@ class BoUpSLP {
SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
Ops.push_back(I1);
Ops.push_back(I2);
InstructionsState S = getSameOpcode(Ops);
InstructionsState S = getSameOpcode(Ops, TLI);
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
if (S.getOpcode() &&
@@ -1426,6 +1491,7 @@ class BoUpSLP {
/// A vector of operand vectors.
SmallVector<OperandDataVec, 4> OpsVec;

const TargetLibraryInfo &TLI;
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
@@ -1527,7 +1593,7 @@ class BoUpSLP {
int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
int Lane, unsigned OpIdx, unsigned Idx,
bool &IsUsed) {
LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(),
LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
LookAheadMaxDepth);
// Keep track of the instruction stack as we recurse into the operands
// during the look-ahead score exploration.
@@ -1749,7 +1815,7 @@ class BoUpSLP {
// Use Boyer-Moore majority voting for finding the majority opcode and
// the number of times it occurs.
if (auto *I = dyn_cast<Instruction>(OpData.V)) {
if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() ||
if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
I->getParent() != Parent) {
if (NumOpsWithSameOpcodeParent == 0) {
NumOpsWithSameOpcodeParent = 1;
@@ -1851,9 +1917,9 @@ class BoUpSLP {

public:
/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
ScalarEvolution &SE, const BoUpSLP &R)
: DL(DL), SE(SE), R(R) {
VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI,
const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
: TLI(TLI), DL(DL), SE(SE), R(R) {
// Append all the operands of RootVL.
appendOperandsOfVL(RootVL);
}
@@ -1994,7 +2060,7 @@ class BoUpSLP {
if (MainAltOps[OpIdx].size() != 2) {
OperandData &AltOp = getData(OpIdx, Lane);
InstructionsState OpS =
getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V});
getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
if (OpS.getOpcode() && OpS.isAltShuffle())
MainAltOps[OpIdx].push_back(AltOp.V);
}
@@ -2069,7 +2135,7 @@ class BoUpSLP {
Optional<int>
findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
int Limit = LookAheadHeuristics::ScoreFail) {
LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
RootLookAheadMaxDepth);
int BestScore = Limit;
Optional<int> Index;
@@ -2244,12 +2310,10 @@ class BoUpSLP {

/// Reorder commutative or alt operands to get better probability of
/// generating vectorized code.
static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const DataLayout &DL,
ScalarEvolution &SE,
const BoUpSLP &R);
static void reorderInputsAccordingToOpcode(
ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);

/// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
/// users of \p TE and collects the stores. It returns the map from the store
@@ -2607,7 +2671,7 @@ class BoUpSLP {
return UndefValue::get(VL.front()->getType());
return VL[Idx];
});
InstructionsState S = getSameOpcode(Last->Scalars);
InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
Last->setOperations(S);
Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
}
@@ -3540,6 +3604,7 @@ enum class LoadsState { Gather, Vectorize, ScatterVectorize };
} // anonymous namespace

static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
const TargetLibraryInfo &TLI,
bool CompareOpcodes = true) {
if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
return false;
@@ -3553,7 +3618,7 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
((isConstant(GEP1->getOperand(1)) &&
isConstant(GEP2->getOperand(1))) ||
!CompareOpcodes ||
getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)})
getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
.getOpcode());
}

@@ -3562,7 +3627,7 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
const TargetTransformInfo &TTI,
const DataLayout &DL, ScalarEvolution &SE,
LoopInfo &LI,
LoopInfo &LI, const TargetLibraryInfo &TLI,
SmallVectorImpl<unsigned> &Order,
SmallVectorImpl<Value *> &PointerOps) {
// Check that a vectorized load would load the same memory as a scalar
@@ -3592,8 +3657,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
Order.clear();
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) {
return arePointersCompatible(P, PointerOps.front());
if (IsSorted || all_of(PointerOps, [&](Value *P) {
return arePointersCompatible(P, PointerOps.front(), TLI);
})) {
if (IsSorted) {
Value *Ptr0;
@@ -4715,7 +4780,8 @@ static std::pair<size_t, size_t> generateKeySubkey(
/// the given \p MainOp and \p AltOp instructions.
static bool isAlternateInstruction(const Instruction *I,
const Instruction *MainOp,
const Instruction *AltOp);
const Instruction *AltOp,
const TargetLibraryInfo &TLI);

void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
@@ -4761,7 +4827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return true;
};

InstructionsState S = getSameOpcode(VL);
InstructionsState S = getSameOpcode(VL, *TLI);

// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
// a load), in which case peek through to include it in the tree, without
@@ -4969,7 +5035,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Reset S to make it GetElementPtr kind of node.
const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
assert(It != VL.end() && "Expected at least one GEP.");
S = getSameOpcode(*It);
S = getSameOpcode(*It, *TLI);
}

// Check that all of the users of the scalars that we want to vectorize are
@@ -5173,8 +5239,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<Value *> PointerOps;
OrdersType CurrentOrder;
TreeEntry *TE = nullptr;
switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder,
PointerOps)) {
switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI,
CurrentOrder, PointerOps)) {
case LoadsState::Vectorize:
if (CurrentOrder.empty()) {
// Original loads are consecutive and does not require reordering.
@@ -5285,7 +5351,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
assert(P0 == SwapP0 && "Commutative Predicate mismatch");
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this);
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
@@ -5332,7 +5398,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -5643,7 +5709,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!CI || all_of(VL, [](Value *V) {
return cast<CmpInst>(V)->isCommutative();
})) {
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE,
*this);
} else {
auto *MainCI = cast<CmpInst>(S.MainOp);
auto *AltCI = cast<CmpInst>(S.AltOp);
@@ -5658,7 +5725,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Value *LHS = Cmp->getOperand(0);
Value *RHS = Cmp->getOperand(1);

if (isAlternateInstruction(Cmp, MainCI, AltCI)) {
if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
std::swap(LHS, RHS);
} else {
@@ -5952,16 +6019,17 @@ buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,

static bool isAlternateInstruction(const Instruction *I,
const Instruction *MainOp,
const Instruction *AltOp) {
const Instruction *AltOp,
const TargetLibraryInfo &TLI) {
if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
auto *AltCI = cast<CmpInst>(AltOp);
CmpInst::Predicate MainP = MainCI->getPredicate();
CmpInst::Predicate AltP = AltCI->getPredicate();
assert(MainP != AltP && "Expected different main/alternate predicates.");
auto *CI = cast<CmpInst>(I);
if (isCmpSameOrSwapped(MainCI, CI))
if (isCmpSameOrSwapped(MainCI, CI, TLI))
return false;
if (isCmpSameOrSwapped(AltCI, CI))
if (isCmpSameOrSwapped(AltCI, CI, TLI))
return true;
CmpInst::Predicate P = CI->getPredicate();
CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
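For cmp bundles, isAlternateInstruction classifies each instruction against the main and alternate predicates, using isCmpSameOrSwapped (and through it TLI) to honor operand-swapped forms. A sketch with function-argument operands (hypothetical IR):

; MainOp: fcmp olt, AltOp: fcmp ole (olt's swapped form is ogt, so the two
; predicates cannot be confused with each other's swaps).
%m = fcmp olt double %x, %y
%a = fcmp ole double %x, %y
%i = fcmp ole double %p, %q   ; same predicate as AltOp -> alternate
%j = fcmp ogt double %q, %p   ; swapped form of MainOp -> not alternate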
@@ -6231,7 +6299,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
OrdersType CurrentOrder;
LoadsState LS =
canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,
CurrentOrder, PointerOps);
*TLI, CurrentOrder, PointerOps);
switch (LS) {
case LoadsState::Vectorize:
case LoadsState::ScatterVectorize:
@@ -7666,15 +7734,13 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {

// Perform operand reordering on the instructions in VL and return the reordered
// operands in Left and Right.
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const DataLayout &DL,
ScalarEvolution &SE,
const BoUpSLP &R) {
void BoUpSLP::reorderInputsAccordingToOpcode(
ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) {
if (VL.empty())
return;
VLOperands Ops(VL, DL, SE, R);
VLOperands Ops(VL, TLI, DL, SE, R);
// Reorder the operands in place.
Ops.reorder();
Left = Ops.getVL(0);
@@ -7967,13 +8033,13 @@ class ShuffleInstructionBuilder {

Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
const unsigned VF = VL.size();
InstructionsState S = getSameOpcode(VL);
InstructionsState S = getSameOpcode(VL, *TLI);
// Special processing for GEPs bundle, which may include non-gep values.
if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
const auto *It =
find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
if (It != VL.end())
S = getSameOpcode(*It);
S = getSameOpcode(*It, *TLI);
}
if (S.getOpcode()) {
if (TreeEntry *E = getTreeEntry(S.OpValue))
@@ -8691,9 +8757,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<int> Mask;
buildShuffleEntryMask(
E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
[E](Instruction *I) {
[E, this](Instruction *I) {
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
return isAlternateInstruction(I, E->getMainOp(), E->getAltOp());
return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
*TLI);
},
Mask, &OpScalars, &AltScalars);

@@ -10607,7 +10674,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,

// Check that all of the parts are instructions of the same type,
// we permit an alternate opcode via InstructionsState.
InstructionsState S = getSameOpcode(VL);
InstructionsState S = getSameOpcode(VL, *TLI);
if (!S.getOpcode())
return false;

@@ -11252,7 +11319,7 @@ class HorizontalReduction {
}
for (LoadInst *RLI : LIt->second) {
if (arePointersCompatible(RLI->getPointerOperand(),
LI->getPointerOperand())) {
LI->getPointerOperand(), TLI)) {
hash_code SubKey = hash_value(RLI->getPointerOperand());
DoNotReverseVals.insert(RLI);
return SubKey;
@@ -11295,7 +11362,7 @@ class HorizontalReduction {
}
for (LoadInst *RLI : LIt->second) {
if (arePointersCompatible(RLI->getPointerOperand(),
LI->getPointerOperand())) {
LI->getPointerOperand(), TLI)) {
hash_code SubKey = hash_value(RLI->getPointerOperand());
DoNotReverseVals.insert(RLI);
return SubKey;
@@ -11366,7 +11433,8 @@ class HorizontalReduction {
}

/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI,
const TargetLibraryInfo &TLI) {
constexpr int ReductionLimit = 4;
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
@@ -11439,7 +11507,7 @@ class HorizontalReduction {
// Try to vectorize elements based on their type.
for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
InstructionsState S = getSameOpcode(OrigReducedVals);
InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
SmallVector<Value *> Candidates;
Candidates.reserve(2 * OrigReducedVals.size());
DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
@@ -11459,7 +11527,7 @@ class HorizontalReduction {
// Try to handle shuffled extractelements.
if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
I + 1 < E) {
InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]);
InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
if (NextS.getOpcode() == Instruction::ExtractElement &&
!NextS.isAltShuffle()) {
SmallVector<Value *> CommonCandidates(Candidates);
@@ -12125,7 +12193,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
if (IsBinop || IsSelect) {
HorizontalReduction HorRdx;
if (HorRdx.matchAssociativeReduction(P, Inst, *SE, *DL, *TLI))
return HorRdx.tryToReduce(R, TTI);
return HorRdx.tryToReduce(R, TTI, *TLI);
}
return nullptr;
};
@@ -12318,7 +12386,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
/// predicate of the second or the operands IDs are less than the operands IDs
/// of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2,
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
function_ref<bool(Instruction *)> IsDeleted) {
auto *CI1 = cast<CmpInst>(V);
auto *CI2 = cast<CmpInst>(V2);
@@ -12354,7 +12422,7 @@ static bool compareCmp(Value *V, Value *V2,
if (auto *I2 = dyn_cast<Instruction>(Op2)) {
if (I1->getParent() != I2->getParent())
return false;
InstructionsState S = getSameOpcode({I1, I2});
InstructionsState S = getSameOpcode({I1, I2}, TLI);
if (S.getOpcode())
continue;
return false;
@@ -12408,15 +12476,15 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions,
}
// Try to vectorize list of compares.
// Sort by type, compare predicate, etc.
auto &&CompareSorter = [&R](Value *V, Value *V2) {
return compareCmp<false>(V, V2,
auto CompareSorter = [&](Value *V, Value *V2) {
return compareCmp<false>(V, V2, *TLI,
[&R](Instruction *I) { return R.isDeleted(I); });
};

auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) {
auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
if (V1 == V2)
return true;
return compareCmp<true>(V1, V2,
return compareCmp<true>(V1, V2, *TLI,
[&R](Instruction *I) { return R.isDeleted(I); });
};
auto Limit = [&R](Value *V) {
@@ -12498,7 +12566,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
"Different nodes should have different DFS numbers");
if (NodeI1 != NodeI2)
return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
InstructionsState S = getSameOpcode({I1, I2});
InstructionsState S = getSameOpcode({I1, I2}, *TLI);
if (S.getOpcode())
continue;
return I1->getOpcode() < I2->getOpcode();
@@ -12515,7 +12583,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
return ConstOrder && *ConstOrder;
};
auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
if (V1 == V2)
return true;
if (V1->getType() != V2->getType())
@@ -12532,7 +12600,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
if (I1->getParent() != I2->getParent())
return false;
InstructionsState S = getSameOpcode({I1, I2});
InstructionsState S = getSameOpcode({I1, I2}, *TLI);
if (S.getOpcode())
continue;
return false;
@@ -12835,7 +12903,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
"Different nodes should have different DFS numbers");
if (NodeI1 != NodeI2)
return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
InstructionsState S = getSameOpcode({I1, I2});
InstructionsState S = getSameOpcode({I1, I2}, *TLI);
if (S.getOpcode())
return false;
return I1->getOpcode() < I2->getOpcode();
@@ -12847,7 +12915,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
V2->getValueOperand()->getValueID();
};

auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) {
auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
if (V1 == V2)
return true;
if (V1->getPointerOperandType() != V2->getPointerOperandType())
@@ -12860,7 +12928,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
if (I1->getParent() != I2->getParent())
return false;
InstructionsState S = getSameOpcode({I1, I2});
InstructionsState S = getSameOpcode({I1, I2}, *TLI);
return S.getOpcode() > 0;
}
if (isa<Constant>(V1->getValueOperand()) &&
8 changes: 4 additions & 4 deletions llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
@@ -62,11 +62,11 @@ define void @test2(double* %a, double* %b, i8* %e) {
; Don't vectorize volatile loads.
define void @test_volatile_load(double* %a, double* %b, double* %c) {
; CHECK-LABEL: @test_volatile_load(
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A]], align 8
; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B]], align 8
; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[I0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1
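The volatile loads still stay scalar; only the order of the untouched scalar instructions changes. A plausible reading of the change: the look-ahead scoring now bails out on non-simple loads via the isSimple() checks instead of computing pointer distances for them, which shifts where the GEPs end up relative to the loads in the emitted code.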
32 changes: 16 additions & 16 deletions llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
@@ -17,22 +17,22 @@ define void @test() {
; CHECK-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src, i32 0, i64 5), align 8
; CHECK-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src, i32 0, i64 6), align 8
; CHECK-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src, i32 0, i64 7), align 8
; CHECK-NEXT: [[SIN0:%.*]] = call fast double @llvm.sin.f64(double [[A2]])
; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]])
; CHECK-NEXT: [[SQRT0:%.*]] = call fast double @llvm.sqrt.f64(double [[A0]])
; CHECK-NEXT: [[SQRT1:%.*]] = call fast double @llvm.sqrt.f64(double [[A1]])
; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]])
; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]])
; CHECK-NEXT: [[SQRT2:%.*]] = call fast double @llvm.sqrt.f64(double [[A4]])
; CHECK-NEXT: [[SQRT3:%.*]] = call fast double @llvm.sqrt.f64(double [[A5]])
; CHECK-NEXT: [[RES1:%.*]] = fadd fast double [[SQRT0]], [[SIN1]]
; CHECK-NEXT: [[RES2:%.*]] = fadd fast double [[SIN0]], [[SQRT1]]
; CHECK-NEXT: [[RES00:%.*]] = fadd fast double [[RES1]], [[RES2]]
; CHECK-NEXT: [[RES3:%.*]] = fadd fast double [[SQRT2]], [[SIN3]]
; CHECK-NEXT: [[RES4:%.*]] = fadd fast double [[SIN2]], [[SQRT3]]
; CHECK-NEXT: [[RES01:%.*]] = fadd fast double [[RES3]], [[RES4]]
; CHECK-NEXT: store double [[RES00]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst, i32 0, i64 0), align 8
; CHECK-NEXT: store double [[RES01]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst, i32 0, i64 1), align 8
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP5]])
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]])
; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]]
; CHECK-NEXT: store <2 x double> [[TMP15]], <2 x double>* bitcast ([8 x double]* @dst to <2 x double>*), align 8
; CHECK-NEXT: ret void
;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src, i32 0, i64 0), align 8
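With TLI threaded into getSameOpcode, the llvm.sin.f64 and llvm.sqrt.f64 calls now form valid bundles (matching intrinsic IDs), so this test vectorizes into two <2 x double> llvm.sin.v2f64 calls, two llvm.sqrt.v2f64 calls, and vector fadds, where the output previously stayed fully scalar.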