[LV][AArch64] Allow (limited) interleaving for scalable vectors
This patch uses the (de)interleaving intrinsics introduced in
D141924 to handle vectorization of interleaving groups with a
factor of 2 for scalable vectors.

Reviewed By: fhahn, reames

Differential Revision: https://reviews.llvm.org/D145163
huntergr-arm committed Jun 9, 2023
1 parent 40052b0 commit 95bfb19
Showing 7 changed files with 461 additions and 403 deletions.
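For context, a loop along the lines of the following C++ sketch (illustrative only, not taken from the patch or its tests) contains a load interleave group and a store interleave group, each with a factor of 2; with this change the loop vectorizer can widen such loops for SVE using the (de)interleave2 intrinsics instead of falling back to fixed-width vectors.

// Illustrative only: the adjacent even/odd accesses below form factor-2
// interleave groups that the vectorizer can now widen with scalable vectors.
void scale_pairs(float *out, const float *in, int n) {
  for (int i = 0; i < n; ++i) {
    out[2 * i]     = in[2 * i] * 2.0f;     // member 0 of each group
    out[2 * i + 1] = in[2 * i + 1] * 0.5f; // member 1 of each group
  }
}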
19 changes: 12 additions & 7 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14729,9 +14729,11 @@ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
unsigned AArch64TargetLowering::getNumInterleavedAccesses(
VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
unsigned VecSize = 128;
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
if (UseScalable)
VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
}

MachineMemOperand::Flags
@@ -14745,29 +14747,32 @@ AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
bool AArch64TargetLowering::isLegalInterleavedAccessType(
VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {

unsigned VecSize = DL.getTypeSizeInBits(VecTy);
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
auto EC = VecTy->getElementCount();
unsigned MinElts = EC.getKnownMinValue();

UseScalable = false;

// Ensure that the predicate for this number of elements is available.
if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(NumElements))
if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
return false;

// Ensure the number of vector elements is greater than 1.
if (NumElements < 2)
if (MinElts < 2)
return false;

// Ensure the element type is legal.
if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
return false;

if (EC.isScalable())
return MinElts * ElSize == 128;

unsigned VecSize = DL.getTypeSizeInBits(VecTy);
if (Subtarget->forceStreamingCompatibleSVE() ||
(Subtarget->useSVEForFixedLengthVectors() &&
(VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
(VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
isPowerOf2_32(NumElements) && VecSize > 128)))) {
isPowerOf2_32(MinElts) && VecSize > 128)))) {
UseScalable = true;
return true;
}
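The new scalable-vector path in isLegalInterleavedAccessType accepts only types whose known-minimum size is exactly 128 bits, i.e. one full SVE register per part. A minimal standalone sketch of that rule, ignoring the subtarget and predicate-pattern checks (the function name is illustrative, not an LLVM API):

#include <cassert>

// Sketch of the legality rule above: element type must be 8/16/32/64 bits,
// there must be at least two elements, and for scalable vectors the
// known-minimum size must be exactly 128 bits.
static bool isLegalScalableInterleavedAccessSketch(unsigned MinElts,
                                                   unsigned ElSizeInBits) {
  if (MinElts < 2)
    return false;
  if (ElSizeInBits != 8 && ElSizeInBits != 16 && ElSizeInBits != 32 &&
      ElSizeInBits != 64)
    return false;
  return MinElts * ElSizeInBits == 128;
}

int main() {
  assert(isLegalScalableInterleavedAccessSketch(4, 32));  // <vscale x 4 x i32>: legal
  assert(!isLegalScalableInterleavedAccessSketch(8, 32)); // <vscale x 8 x i32>: too wide
  assert(!isLegalScalableInterleavedAccessSketch(2, 32)); // <vscale x 2 x i32>: too narrow
  return 0;
}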
12 changes: 8 additions & 4 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2814,19 +2814,23 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
auto *VecVTy = cast<FixedVectorType>(VecTy);
auto *VecVTy = cast<VectorType>(VecTy);

if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
return InstructionCost::getInvalid();

if (!UseMaskForCond && !UseMaskForGaps &&
Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecVTy->getNumElements();
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
auto *SubVecTy =
FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
VectorType::get(VecVTy->getElementType(),
VecVTy->getElementCount().divideCoefficientBy(Factor));

// ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
// matched to more than one ldN/stN instruction.
bool UseScalable;
if (NumElts % Factor == 0 &&
if (MinElts % Factor == 0 &&
TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
}
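With the cast relaxed from FixedVectorType to VectorType, the cost of a legal scalable group becomes Factor times the number of 128-bit accesses needed for each member's sub-vector. A rough standalone sketch of that arithmetic, assuming the minimum SVE register size of 128 bits (the names are illustrative, not LLVM APIs):

#include <algorithm>
#include <cassert>

// Sketch of the cost formula above: split the group type by Factor, then
// count 128-bit blocks in the sub-vector's known-minimum size.
static unsigned interleavedCostSketch(unsigned Factor, unsigned MinElts,
                                      unsigned ElSizeInBits) {
  unsigned SubMinElts = MinElts / Factor;
  unsigned NumAccesses = std::max(1u, (SubMinElts * ElSizeInBits + 127) / 128);
  return Factor * NumAccesses;
}

int main() {
  // A factor-2 group of <vscale x 8 x i32>: the sub-vector <vscale x 4 x i32>
  // occupies one 128-bit SVE register, so the cost is 2 * 1 = 2.
  assert(interleavedCostSketch(2, 8, 32) == 2);
  return 0;
}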
2 changes: 2 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -445,6 +445,8 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {
if (isa<ScalableVectorType>(VecTy))
return InstructionCost::getInvalid();
auto *FVTy = cast<FixedVectorType>(VecTy);
InstructionCost MemCost =
getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
108 changes: 87 additions & 21 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -439,6 +439,37 @@ static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
return std::nullopt;
}

/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
const Twine &Name) {
unsigned Factor = Vals.size();
assert(Factor > 1 && "Tried to interleave invalid number of vectors");

VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
for (Value *Val : Vals)
assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
return Builder.CreateIntrinsic(
WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
/*FMFSource=*/nullptr, Name);
}

// Fixed length. Start by concatenating all vectors into a wide vector.
Value *WideVec = concatenateVectors(Builder, Vals);

// Interleave the elements into the wide vector.
const unsigned NumElts = VecTy->getElementCount().getFixedValue();
return Builder.CreateShuffleVector(
WideVec, createInterleaveMask(NumElts, Factor), Name);
}

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
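On the fixed-length path, interleaveVectors above relies on llvm::createInterleaveMask, whose documented mask for four elements and two vectors is <0, 4, 1, 5, 2, 6, 3, 7>; the scalable path obtains the same lane ordering from the interleave2 intrinsic instead. A standalone sketch of that mask (illustrative, not the LLVM helper itself):

#include <cassert>
#include <vector>

// Sketch of the interleave mask: pick lane I of each input in turn, so the
// result alternates between the concatenated input vectors.
static std::vector<int> interleaveMaskSketch(unsigned NumElts, unsigned Factor) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < NumElts; ++I)
    for (unsigned J = 0; J < Factor; ++J)
      Mask.push_back(static_cast<int>(J * NumElts + I));
  return Mask;
}

int main() {
  assert((interleaveMaskSketch(4, 2) ==
          std::vector<int>{0, 4, 1, 5, 2, 6, 3, 7}));
  return 0;
}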
@@ -2586,7 +2617,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getLoadStoreType(Instr);
unsigned InterleaveFactor = Group->getFactor();
assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

// Prepare for the new pointers.
@@ -2597,14 +2627,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
assert((!BlockInMask || !Group->isReverse()) &&
"Reversed masked interleave-group not supported.");

Value *Idx;
// If the group is reverse, adjust the index to refer to the last vector lane
// instead of the first. We adjust the index from the first vector lane,
// rather than directly getting the pointer for lane VF - 1, because the
// pointer operand of the interleaved access is supposed to be uniform. For
// uniform instructions, we're only required to generate a value for the
// first vector lane in each unroll iteration.
if (Group->isReverse())
Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
if (Group->isReverse()) {
Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
Idx = Builder.CreateNeg(Idx);
} else
Idx = Builder.getInt32(-Index);

for (unsigned Part = 0; Part < UF; Part++) {
Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
@@ -2625,8 +2662,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
InBounds = gep->isInBounds();
AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index),
"", InBounds);
AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);

// Cast to the vector pointer type.
unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
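For reversed groups the base pointer is now adjusted by a runtime amount rather than a compile-time constant: the GEP index works out to -((RuntimeVF - 1) * Factor + Index) elements, where RuntimeVF is vscale times the known-minimum VF for scalable vectors. A standalone sketch of that computation with a worked example (illustrative only):

#include <cassert>

// Sketch of the GEP index used above for reversed groups: move the uniform
// base pointer back by (RuntimeVF - 1) * Factor + Index elements.
static int reversedGroupElementOffset(int RuntimeVF, int Factor, int Index) {
  return -((RuntimeVF - 1) * Factor + Index);
}

int main() {
  // E.g. VF = <vscale x 4> with vscale = 2 gives RuntimeVF = 8; a factor-2
  // group member at Index 0 is accessed 14 elements before the base pointer.
  assert(reversedGroupElementOffset(8, 2, 0) == -14);
  return 0;
}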
@@ -2676,6 +2712,41 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
NewLoads.push_back(NewLoad);
}

if (VecTy->isScalableTy()) {
assert(InterleaveFactor == 2 &&
"Unsupported deinterleave factor for scalable vectors");

for (unsigned Part = 0; Part < UF; ++Part) {
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
Value *DI = Builder.CreateIntrinsic(
Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
/*FMFSource=*/nullptr, "strided.vec");
unsigned J = 0;
for (unsigned I = 0; I < InterleaveFactor; ++I) {
Instruction *Member = Group->getMember(I);

if (!Member)
continue;

Value *StridedVec = Builder.CreateExtractValue(DI, I);
// If this member has a different type, cast the result type.
if (Member->getType() != ScalarTy) {
VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
}

if (Group->isReverse())
StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");

State.set(VPDefs[J], StridedVec, Part);
++J;
}
}

return;
}

// For each member in the group, shuffle out the appropriate data from the
// wide loads.
unsigned J = 0;
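The deinterleave2 intrinsic used on the scalable load path splits the wide load into its even and odd lanes, which become members 0 and 1 of the group. A scalar model of that behaviour (illustrative, not the intrinsic itself):

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Scalar sketch of factor-2 deinterleaving: even lanes of the wide vector go
// to the first result, odd lanes to the second.
static std::pair<std::vector<int>, std::vector<int>>
deinterleave2Sketch(const std::vector<int> &Wide) {
  std::pair<std::vector<int>, std::vector<int>> Halves;
  for (std::size_t I = 0; I + 1 < Wide.size(); I += 2) {
    Halves.first.push_back(Wide[I]);
    Halves.second.push_back(Wide[I + 1]);
  }
  return Halves;
}

int main() {
  auto Halves = deinterleave2Sketch({10, 11, 20, 21, 30, 31});
  assert((Halves.first == std::vector<int>{10, 20, 30}));
  assert((Halves.second == std::vector<int>{11, 21, 31}));
  return 0;
}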
@@ -2749,14 +2820,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
StoredVecs.push_back(StoredVec);
}

// Concatenate all vectors into a wide vector.
Value *WideVec = concatenateVectors(Builder, StoredVecs);

// Interleave the elements in the wide vector.
Value *IVec = Builder.CreateShuffleVector(
WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
"interleaved.vec");

// Interleave all the smaller vectors into one wider vector.
Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
Instruction *NewStoreInstr;
if (BlockInMask || MaskForGaps) {
Value *GroupMask = MaskForGaps;
@@ -6547,11 +6612,6 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
ElementCount VF) {
// TODO: Once we have support for interleaving with scalable vectors
// we can calculate the cost properly here.
if (VF.isScalable())
return InstructionCost::getInvalid();

Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
@@ -8859,9 +8919,15 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// single VPInterleaveRecipe.
for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
auto applyIG = [IG, this](ElementCount VF) -> bool {
return (VF.isVector() && // Query is illegal for VF == 1
CM.getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
bool Result = (VF.isVector() && // Query is illegal for VF == 1
CM.getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
// For scalable vectors, the only interleave factor currently supported
// is 2 since we require the (de)interleave2 intrinsics instead of
// shufflevectors.
assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
"Unsupported interleave factor for scalable vectors");
return Result;
};
if (!getDecisionAndClampRange(applyIG, Range))
continue;