3 changes: 3 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3232,6 +3232,9 @@ class LLVM_ABI TargetLoweringBase {
/// Default to be the minimum interleave factor: 2.
virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }

/// Return true if the target can lower interleave factors larger than the
/// maximum supported factor by combining the native ldN/stN (or
/// gather/scatter) instructions with additional shuffles.
virtual bool hasInterleaveWithGatherScatter() const { return false; }

/// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
13 changes: 11 additions & 2 deletions llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
unsigned MaxFactor) {
unsigned MaxFactor,
bool InterleaveWithShuffles) {
unsigned NumElts = SVI->getShuffleMask().size();
if (NumElts < 4)
return false;
@@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
return true;
}

if (InterleaveWithShuffles) {
for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
Factor = i * MaxFactor;
if (SVI->isInterleave(Factor))
return true;
}
}
return false;
}
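To make the new probing loop concrete: with AArch64's MaxFactor of 4 it tries factors 4, 8, and 16 and accepts the first whose interleave pattern matches. Below is a minimal self-contained sketch of that check, a plain-C++ re-implementation for illustration only (the pass itself calls ShuffleVectorInst::isInterleave):

#include <cstdio>
#include <vector>

// Does Mask re-interleave `Factor` lanes of length Mask.size() / Factor,
// i.e. <0, LaneLen, 2*LaneLen, ..., 1, LaneLen+1, ...>?
static bool matchesInterleave(const std::vector<int> &Mask, unsigned Factor) {
  unsigned NumElts = Mask.size();
  if (Factor == 0 || NumElts % Factor != 0)
    return false;
  unsigned LaneLen = NumElts / Factor;
  for (unsigned I = 0; I < NumElts; ++I)
    if (Mask[I] != (int)((I % Factor) * LaneLen + I / Factor))
      return false;
  return true;
}

// Probe MaxFactor, 2*MaxFactor, ... up to 16, as the extended pass does.
static bool findReInterleaveFactor(const std::vector<int> &Mask,
                                   unsigned MaxFactor, unsigned &Factor) {
  for (unsigned I = 1; MaxFactor * I <= 16; I *= 2)
    if (matchesInterleave(Mask, MaxFactor * I)) {
      Factor = MaxFactor * I;
      return true;
    }
  return false;
}

int main() {
  // Factor-8 interleave of lanes of length 2: <0,2,4,...,14,1,3,...,15>.
  std::vector<int> Mask = {0, 2, 4, 6, 8, 10, 12, 14,
                           1, 3, 5, 7, 9, 11, 13, 15};
  unsigned Factor = 0;
  if (findReInterleaveFactor(Mask, /*MaxFactor=*/4, Factor))
    std::printf("re-interleave factor %u\n", Factor); // prints 8
}

For the factor-8 mask in main, the factor-4 pattern already fails at element 1 (2 != 4), so the probe settles on 8.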

@@ -528,7 +536,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
cast<FixedVectorType>(SVI->getType())->getNumElements();
// Check if the shufflevector is RE-interleave shuffle.
unsigned Factor;
if (!isReInterleaveMask(SVI, Factor, MaxFactor))
if (!isReInterleaveMask(SVI, Factor, MaxFactor,
TLI->hasInterleaveWithGatherScatter()))
return false;
assert(NumStoredElements % Factor == 0 &&
"number of stored element should be a multiple of Factor");
163 changes: 160 additions & 3 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -97,6 +97,7 @@
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <deque>
#include <iterator>
#include <limits>
#include <optional>
@@ -18144,12 +18145,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
ShuffleVectorInst *SVI,
unsigned Factor,
const APInt &GapMask) const {

assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(Factor >= 2 && "Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;

if (Factor > getMaxSupportedInterleaveFactor())
return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);

assert(!LaneMask && GapMask.popcount() == Factor &&
"Unexpected mask on store");

@@ -18295,6 +18298,160 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
return true;
}

/// If the interleave factor is greater than the supported MaxFactor, the data
/// can be interleaved with additional shuffles and then stored with the
/// natively supported stN instructions.
///
/// Consider the following example with an interleave factor of 8, where the
/// data is shuffled before being stored with stN instructions. The data must
/// land in memory in this order:
/// [v0, v1, v2, v3, v4, v5, v6, v7]
///
/// v0 v4 v2 v6 v1 v5 v3 v7
/// | | | | | | | |
/// \ / \ / \ / \ /
/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4
/// | | | |
/// \ / \ /
/// \ / \ /
/// \ / \ /
/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2
///
/// For stN = 4, the lower halves of the four zip results are stored with one
/// st4 instruction (covering the first half of the memory region) and the
/// upper halves with another st4.
///
/// For stN = 2, the tree is zipped one level further and the results are
/// stored pairwise with st2, for a total of 4 st2 instructions.
bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();

auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

unsigned LaneLen = VecTy->getNumElements() / Factor;
Type *EltTy = VecTy->getElementType();
auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

const DataLayout &DL = SI->getModule()->getDataLayout();
bool UseScalable;

// Skip if we do not have NEON, and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long
// as the vector size is a multiple of 128 bits.
if (!Subtarget->hasNEON() ||
!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
return false;

if (UseScalable)
return false;

std::deque<Value *> Shuffles;
// If the root shuffle has only one live operand.
if (isa<PoisonValue>(SVI->getOperand(1)) &&
SVI->getType() == SVI->getOperand(0)->getType()) {
Value *Op0 = SVI->getOperand(0);
Shuffles.push_back(Op0);
} else
Shuffles.push_back(SVI);
unsigned ConcatLevel = Factor;
unsigned ConcatElt = Factor * LaneLen;
// Collect all the interleaved operands by walking the concat-shuffle tree.
while (ConcatLevel > 1) {
unsigned InterleavedOperands = Shuffles.size();
for (unsigned Ops = 0; Ops < InterleavedOperands; Ops++) {
auto *V = Shuffles.front();
Shuffles.pop_front();
if (isa<ConstantAggregateZero, PoisonValue>(V)) {
VectorType *Ty = cast<VectorType>(V->getType());
auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
Value *SplitValue = nullptr;
if (isa<ConstantAggregateZero>(V))
SplitValue = ConstantAggregateZero::get(HalfTy);
else
SplitValue = PoisonValue::get(HalfTy);

Shuffles.push_back(SplitValue);
Shuffles.push_back(SplitValue);
continue;
}
if (V->getType() == SubVecTy) {
Shuffles.push_back(V);
continue;
}
ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(V);
if (!SFL)
return false;
if (SVI != SFL && !SFL->isIdentityMask(SFL->getShuffleMask(), ConcatElt))
return false;

Value *Op0 = SFL->getOperand(0);
Value *Op1 = SFL->getOperand(1);

Shuffles.push_back(Op0);
Shuffles.push_back(Op1);
}
ConcatLevel >>= 1;
ConcatElt >>= 1;
}

IRBuilder<> Builder(SI);
auto Mask = createInterleaveMask(LaneLen, 2);
SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen);
for (unsigned Idx = 0; Idx < LaneLen; Idx++) {
LowerHalfMask[Idx] = Mask[Idx];
UpperHalfMask[Idx] = Mask[Idx + LaneLen];
}

unsigned InterleaveFactor = Factor >> 1;
while (InterleaveFactor >= MaxSupportedFactor) {
std::deque<Value *> ShufflesIntermediate;
ShufflesIntermediate.resize(Factor);
for (unsigned Idx = 0; Idx < Factor; Idx += (InterleaveFactor * 2)) {
for (unsigned GroupIdx = 0; GroupIdx < InterleaveFactor; GroupIdx++) {
assert(Shuffles[Idx + GroupIdx]->getType() == SubVecTy &&
Shuffles[Idx + GroupIdx + InterleaveFactor]->getType() ==
SubVecTy &&
"Type of interleaving candidates are not matching\n");
auto *Shuffle = Builder.CreateShuffleVector(
Shuffles[Idx + GroupIdx],
Shuffles[Idx + GroupIdx + InterleaveFactor], LowerHalfMask);
ShufflesIntermediate[Idx + GroupIdx] = Shuffle;
Shuffle = Builder.CreateShuffleVector(
Shuffles[Idx + GroupIdx],
Shuffles[Idx + GroupIdx + InterleaveFactor], UpperHalfMask);
ShufflesIntermediate[Idx + GroupIdx + InterleaveFactor] = Shuffle;
}
}
Shuffles = ShufflesIntermediate;
InterleaveFactor >>= 1;
}

Type *PtrTy = SI->getPointerOperandType();
auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

Value *BaseAddr = SI->getPointerOperand();
Function *StNFunc = getStructuredStoreFunction(
SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
for (unsigned N = 0; N < (Factor / MaxSupportedFactor); N++) {
SmallVector<Value *, 5> Ops;
for (unsigned OpIdx = 0; OpIdx < MaxSupportedFactor; OpIdx++)
Ops.push_back(Shuffles[N * MaxSupportedFactor + OpIdx]);

if (N > 0) {
// We will compute the pointer operand of each store from the original
// base address using GEPs. Cast the base address to a pointer to the
// scalar element type.
BaseAddr = Builder.CreateConstGEP1_32(
SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
}
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
Builder.CreateCall(StNFunc, Ops);
}
return true;
}
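The zip tree in the comment above can be sanity-checked with a small standalone simulation over element indices, assuming Factor = 8, MaxSupportedFactor = 4 (NEON st4), and an illustrative lane length of 2. This mirrors the lower/upper-half zips and the st4 grouping; it is a sketch of the dataflow, not the actual IR lowering:

#include <cassert>
#include <cstdio>
#include <utility>
#include <vector>

using Elt = std::pair<int, int>; // (source vector, lane)
using Vec = std::vector<Elt>;

int main() {
  const unsigned Factor = 8, MaxSupportedFactor = 4, LaneLen = 2;

  // Eight sub-vectors v0..v7, each LaneLen elements wide.
  std::vector<Vec> Shuffles(Factor);
  for (unsigned K = 0; K < Factor; ++K)
    for (unsigned L = 0; L < LaneLen; ++L)
      Shuffles[K].push_back({(int)K, (int)L});

  // One zip level per halving of the factor, mirroring the pass:
  // Next[G] gets the lower half of zip(v_G, v_{G+IF}), Next[G+IF] the upper.
  for (unsigned IF = Factor / 2; IF >= MaxSupportedFactor; IF /= 2) {
    std::vector<Vec> Next(Factor);
    for (unsigned Idx = 0; Idx < Factor; Idx += 2 * IF)
      for (unsigned G = 0; G < IF; ++G) {
        Vec Zip; // full two-way interleave of the pair
        for (unsigned L = 0; L < LaneLen; ++L) {
          Zip.push_back(Shuffles[Idx + G][L]);
          Zip.push_back(Shuffles[Idx + G + IF][L]);
        }
        Next[Idx + G] = Vec(Zip.begin(), Zip.begin() + LaneLen);
        Next[Idx + G + IF] = Vec(Zip.begin() + LaneLen, Zip.end());
      }
    Shuffles = Next;
  }

  // Emulate Factor / MaxSupportedFactor = 2 st4 instructions: the N-th st4
  // interleaves Shuffles[4N..4N+3] into the next 4 * LaneLen memory slots.
  Vec Memory;
  for (unsigned N = 0; N < Factor / MaxSupportedFactor; ++N)
    for (unsigned J = 0; J < MaxSupportedFactor * LaneLen; ++J)
      Memory.push_back(
          Shuffles[N * MaxSupportedFactor + J % MaxSupportedFactor]
                  [J / MaxSupportedFactor]);

  // The result must be the full 8-way interleave: v0[0], v1[0], ..., v7[0], ...
  for (unsigned P = 0; P < Memory.size(); ++P)
    assert(Memory[P] == Elt((int)(P % Factor), (int)(P / Factor)));
  std::printf("factor-8 store via 2 x st4 verified\n");
}

After the single zip level, Shuffles[4N..4N+3] feed the N-th st4, and the emulated memory comes out as v0[0], v1[0], ..., v7[0], v0[1], and so on: exactly the 8-way interleave requested by the original shuffle.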

bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
5 changes: 5 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,6 +229,8 @@ class AArch64TargetLowering : public TargetLowering {

bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;

bool hasInterleaveWithGatherScatter() const override { return true; }

unsigned getMaxSupportedInterleaveFactor() const override { return 4; }

bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +241,9 @@
ShuffleVectorInst *SVI, unsigned Factor,
const APInt &GapMask) const override;

bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const;

bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
IntrinsicInst *DI) const override;

30 changes: 25 additions & 5 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4932,19 +4932,39 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
return InstructionCost::getInvalid();

if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumLoadStores = 1;
InstructionCost ShuffleCost = 0;
bool IsInterleaveWithShuffle = false;
unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();

auto *SubVecTy =
VectorType::get(VecVTy->getElementType(),
VecVTy->getElementCount().divideCoefficientBy(Factor));

if (TLI->hasInterleaveWithGatherScatter() && Opcode == Instruction::Store &&
(0 == Factor % MaxSupportedFactor) && Factor > MaxSupportedFactor) {
IsInterleaveWithShuffle = true;

NumLoadStores = Factor / MaxSupportedFactor;
ShuffleCost =
(Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
{}, CostKind, 0, SubVecTy));
}

if (!UseMaskForGaps &&
(Factor <= MaxSupportedFactor || IsInterleaveWithShuffle)) {
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
auto *SubVecTy =
VectorType::get(VecVTy->getElementType(),
VecVTy->getElementCount().divideCoefficientBy(Factor));

// ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
// matched to more than one ldN/stN instruction.
bool UseScalable;
if (MinElts % Factor == 0 &&
TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
return (Factor *
TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
NumLoadStores) +
ShuffleCost;
}

return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
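In rough numbers, assuming a factor-8 NEON store with MaxSupportedFactor = 4, getNumInterleavedAccesses = 1, and a unit splice cost: NumLoadStores = 2 groups of st4 plus 8 splice shuffles, so the new return value is 8 * 1 * 2 + 8 = 24. A compact sketch of that arithmetic (hypothetical helper with made-up unit costs, not part of the patch):

#include <cstdio>

// Restates the cost formula above; the real unit costs come from TLI/TTI
// queries, the values passed in here are illustrative only.
unsigned interleavedStoreCost(unsigned Factor, unsigned MaxSupportedFactor,
                              unsigned NumAccesses, unsigned SpliceCost) {
  unsigned NumLoadStores = 1, ShuffleCost = 0;
  if (Factor > MaxSupportedFactor && Factor % MaxSupportedFactor == 0) {
    NumLoadStores = Factor / MaxSupportedFactor; // stN groups needed
    ShuffleCost = Factor * SpliceCost;           // zip-tree shuffles
  }
  return Factor * NumAccesses * NumLoadStores + ShuffleCost;
}

int main() {
  // Factor-8 NEON store, st4 max, one access per stN, splice cost 1:
  // 8 * 1 * 2 + 8 = 24.
  std::printf("%u\n", interleavedStoreCost(8, 4, 1, 1));
}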