[LV] Support vectorization of interleave-groups that require an epilog under optsize using masked wide loads

Under Opt for Size, the vectorizer does not vectorize interleave-groups that
have gaps at the end of the group (such as a loop that reads only the even
elements: a[2*i]), because that implies we'll require a scalar epilogue
(which is not allowed under Opt for Size). This patch extends the support for
masked interleave-groups (introduced by D53011 for conditional accesses) to
also cover the case of gaps in a group of loads; targets that enable the
masked-interleave-group feature no longer have to invalidate interleave-groups
of loads with gaps: they can now use masked wide loads and shuffles (if that's
what the cost model selects).
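
As a concrete illustration (an editor's sketch, not part of the original
commit message), this is the kind of loop the patch enables, with the masked
strategy outlined in comments:

#include <cstddef>

// Reads only member 0 of a factor-2 interleave group; a[2*i+1] is a gap.
// Under Opt for Size this loop previously stayed scalar: the last wide load
// of a vectorized body could read past the end of 'a' unless a scalar
// epilogue peeled off the tail.
int sumEven(const int *a, std::size_t n) {
  int s = 0;
  for (std::size_t i = 0; i < n; ++i)
    s += a[2 * i];
  return s;
}

// With masked interleave-groups enabled, a VF=4 body can instead issue one
// masked wide load of 8 ints per iteration with the gap lanes disabled;
// conceptually:
//   %wide = masked load of <8 x i32> under mask <1,0,1,0,1,0,1,0>
//   %even = shufflevector %wide to extract lanes <0,2,4,6>
// so no scalar epilogue is required.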

Reviewers: Ayal, hsaito, dcaballe, fhahn

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D53668

llvm-svn: 345705
dnuzman committed Oct 31, 2018
1 parent 889356e commit 34da6dd
Showing 20 changed files with 453 additions and 153 deletions.
20 changes: 13 additions & 7 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -588,7 +588,8 @@ class TargetTransformInfo {
bool enableInterleavedAccessVectorization() const;

/// Enable matching of interleaved access groups that contain predicated
/// accesses and are vectorized using masked vector loads/stores.
/// accesses or gaps and therefore vectorized using masked
/// vector loads/stores.
bool enableMaskedInterleavedAccessVectorization() const;
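
A target opts in by overriding this hook in its own TTI implementation; a
minimal sketch (hypothetical target, not part of this patch):

  bool enableMaskedInterleavedAccessVectorization() { return true; }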

/// Indicate that it is potentially unsafe to automatically vectorize
@@ -827,11 +828,13 @@
/// load allows gaps)
/// \p Alignment is the alignment of the memory operation
/// \p AddressSpace is address space of the pointer.
/// \p IsMasked indicates if the memory access is predicated.
/// \p UseMaskForCond indicates if the memory access is predicated.
/// \p UseMaskForGaps indicates if gaps should be masked.
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace,
bool IsMasked = false) const;
unsigned AddressSpace,
bool UseMaskForCond = false,
bool UseMaskForGaps = false) const;
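
A hypothetical cost query exercising the two new flags (an editor's sketch;
TTI, Ctx, and the constant values are assumptions for illustration):

  // Factor-3 interleaved load of <12 x i32> where only member 0 is live
  // (Indices = {0}), so the gap lanes must be masked.
  int Cost = TTI.getInterleavedMemoryOpCost(
      Instruction::Load, VectorType::get(Type::getInt32Ty(Ctx), 12),
      /*Factor=*/3, /*Indices=*/{0}, /*Alignment=*/4, /*AddressSpace=*/0,
      /*UseMaskForCond=*/false, /*UseMaskForGaps=*/true);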

/// Calculate the cost of performing a vector reduction.
///
@@ -1142,7 +1145,8 @@ class TargetTransformInfo::Concept {
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked = false) = 0;
bool UseMaskForCond = false,
bool UseMaskForGaps = false) = 0;
virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) = 0;
virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
@@ -1484,9 +1488,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
}
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace, bool IsMasked) override {
unsigned AddressSpace, bool UseMaskForCond,
bool UseMaskForGaps) override {
return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
}
int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) override {
3 changes: 2 additions & 1 deletion llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -453,7 +453,8 @@ class TargetTransformInfoImplBase {
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
bool IsMasked = false) {
bool UseMaskForCond = false,
bool UseMaskForGaps = false) {
return 1;
}

19 changes: 17 additions & 2 deletions llvm/include/llvm/Analysis/VectorUtils.h
@@ -24,6 +24,7 @@ namespace llvm {
template <typename T> class ArrayRef;
class DemandedBits;
class GetElementPtrInst;
class InterleaveGroup;
class Loop;
class ScalarEvolution;
class TargetTransformInfo;
@@ -125,6 +126,20 @@ computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
/// This function always sets a (possibly null) value for each K in Kinds.
Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);

/// Create a mask that filters the members of an interleave group where there
/// are gaps.
///
/// For example, the mask for \p Group with interleave-factor 3
/// and \p VF 4, that has only its first member present is:
///
/// <1,0,0,1,0,0,1,0,0,1,0,0>
///
/// Note: The result is a mask of 0's and 1's, as opposed to the other
/// create[*]Mask() utilities which create a shuffle mask (mask that
/// consists of indices).
Constant *createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
const InterleaveGroup &Group);
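
For contrast with those shuffle-mask utilities, here are both mask kinds for
a factor-3, VF=4 group whose only live member is member 0 (an editor's
sketch; Builder and Group are assumed to be in scope):

  Constant *GapMask = createBitMaskForGaps(Builder, /*VF=*/4, Group);
  // GapMask == <i1 1,0,0, 1,0,0, 1,0,0, 1,0,0>   (lane-enable bits)
  Constant *Lanes = createStrideMask(Builder, /*Start=*/0, /*Stride=*/3, /*VF=*/4);
  // Lanes   == <i32 0,3,6,9>                     (shufflevector indices)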

/// Create a mask with replicated elements.
///
/// This function creates a shuffle mask for replicating each of the \p VF
@@ -406,8 +421,8 @@ class InterleavedAccessInfo {
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }

/// Invalidate groups that require a scalar epilogue (due to gaps). This can
/// happen when we optimize for size and don't allow creating a scalar
/// epilogue.
/// happen when optimizing for size forbids a scalar epilogue, and the gap
/// cannot be filtered by masking the load/store.
void invalidateGroupsRequiringScalarEpilogue();

private:
16 changes: 13 additions & 3 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -804,7 +804,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
bool IsMasked = false) {
bool UseMaskForCond = false,
bool UseMaskForGaps = false) {
VectorType *VT = dyn_cast<VectorType>(VecTy);
assert(VT && "Expect a vector type for interleaved memory op");

@@ -816,7 +817,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {

// Firstly, the cost of load/store operation.
unsigned Cost;
if (IsMasked)
if (UseMaskForCond || UseMaskForGaps)
Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
Opcode, VecTy, Alignment, AddressSpace);
else
@@ -917,7 +918,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
->getVectorInstrCost(Instruction::InsertElement, VT, i);
}

if (!IsMasked)
if (!UseMaskForCond)
return Cost;

Type *I8Type = Type::getInt8Ty(VT->getContext());
@@ -942,6 +943,15 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
Cost += static_cast<T *>(this)->getVectorInstrCost(
Instruction::InsertElement, MaskVT, i);

// The Gaps mask is invariant and created outside the loop, therefore the
// cost of creating it is not accounted for here. However if we have both
// a MaskForGaps and some other mask that guards the execution of the
// memory access, we need to account for the cost of And-ing the two masks
// inside the loop.
if (UseMaskForGaps)
Cost += static_cast<T *>(this)->getArithmeticInstrCost(
BinaryOperator::And, MaskVT);

return Cost;
}

9 changes: 6 additions & 3 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -519,9 +519,12 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,

int TargetTransformInfo::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace, bool IsMasked) const {
int Cost = TTIImpl->getInterleavedMemoryOpCost(
Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked);
unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
bool UseMaskForGaps) const {
int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace,
UseMaskForCond,
UseMaskForGaps);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
24 changes: 22 additions & 2 deletions llvm/lib/Analysis/VectorUtils.cpp
@@ -504,6 +504,25 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
return Inst;
}

Constant *llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
const InterleaveGroup &Group) {
// All 1's means mask is not needed.
if (Group.getNumMembers() == Group.getFactor())
return nullptr;

// TODO: support reversed access.
assert(!Group.isReverse() && "Reversed group not supported.");

SmallVector<Constant *, 16> Mask;
for (unsigned i = 0; i < VF; i++)
for (unsigned j = 0; j < Group.getFactor(); ++j) {
unsigned HasMember = Group.getMember(j) ? 1 : 0;
Mask.push_back(Builder.getInt1(HasMember));
}

return ConstantVector::get(Mask);
}
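
A sketch of how the vectorizer consumes this mask when emitting a masked wide
load (an editor's illustration with assumed names; the actual emission lives
in LoopVectorize):

  Constant *MaskForGaps = createBitMaskForGaps(Builder, VF, Group);
  Value *GroupMask = MaskForGaps;
  if (CondMask) // group is also predicated: AND the two masks inside the loop
    GroupMask = Builder.CreateAnd(CondMask, MaskForGaps);
  Value *WideLoad = Builder.CreateMaskedLoad(VecPtr, Group.getAlignment(),
                                             GroupMask, UndefVec);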

Constant *llvm::createReplicatedMask(IRBuilder<> &Builder,
unsigned ReplicationFactor, unsigned VF) {
SmallVector<Constant *, 16> MaskVec;
@@ -935,9 +954,10 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
}
for (auto *Ptr : DelSet) {
LLVM_DEBUG(
dbgs()
dbgs()
<< "LV: Invalidate candidate interleaved group due to gaps that "
"require a scalar epilogue.\n");
"require a scalar epilogue (not allowed under optsize) and cannot "
"be masked (not enabled). \n");
releaseGroup(Ptr);
}

9 changes: 6 additions & 3 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -660,11 +660,13 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked) {
bool UseMaskForCond,
bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");

if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
if (!UseMaskForCond && !UseMaskForGaps &&
Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);

@@ -677,7 +679,8 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
}

return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
}

int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
4 changes: 3 additions & 1 deletion llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -146,7 +146,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {

int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace, bool IsMasked = false);
unsigned AddressSpace,
bool UseMaskForCond = false,
bool UseMaskForGaps = false);

bool
shouldConsiderAddressTypePromotion(const Instruction &I,
8 changes: 5 additions & 3 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -564,15 +564,16 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked) {
bool UseMaskForCond,
bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");

// vldN/vstN doesn't support vector types of i64/f64 element.
bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;

if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
!IsMasked) {
!UseMaskForCond && !UseMaskForGaps) {
unsigned NumElts = VecTy->getVectorNumElements();
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);

@@ -585,7 +586,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
}

return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
}

void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4 changes: 3 additions & 1 deletion llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -169,7 +169,9 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {

int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace, bool IsMasked);
unsigned AddressSpace,
bool UseMaskForCond = false,
bool UseMaskForGaps = false);

void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
8 changes: 5 additions & 3 deletions llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -206,10 +206,12 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,

unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace, bool IsMasked) {
if (Indices.size() != Factor || IsMasked)
unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
bool UseMaskForGaps) {
if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr);
}

3 changes: 2 additions & 1 deletion llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -123,7 +123,8 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
bool VariableMask, unsigned Alignment);
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace, bool IsMasked);
unsigned AddressSpace, bool UseMaskForCond = false,
bool UseMaskForGaps = false);
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I);
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
8 changes: 5 additions & 3 deletions llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -474,10 +474,12 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked) {
if (IsMasked)
bool UseMaskForCond,
bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);

assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");
3 changes: 2 additions & 1 deletion llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -91,7 +91,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked = false);
bool UseMaskForCond = false,
bool UseMaskForGaps = false);

/// @}
};
8 changes: 5 additions & 3 deletions llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -969,10 +969,12 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
bool IsMasked) {
if (IsMasked)
bool UseMaskForCond,
bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, IsMasked);
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");

4 changes: 3 additions & 1 deletion llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -93,7 +93,9 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace, bool IsMasked = false);
unsigned AddressSpace,
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
/// @}
};
