-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LV][AArch64]: Utilise SVE ld4/st4 instructions via auto-vectorisation #89018
base: main
Are you sure you want to change the base?
Conversation
Change-Id: Ibf9dbc4ed31ad32aa603479ba067b488a50308e5
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-backend-aarch64 Author: Hassnaa Hamdi (hassnaaHamdi) Changes
Patch is 32.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89018.diff 10 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0ade02959025f..e233d430e98dd5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -59,6 +59,8 @@
#include <string>
#include <utility>
#include <vector>
+#include <stack>
+#include <queue>
namespace llvm {
@@ -3145,6 +3147,7 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3156,6 +3159,7 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 438ac1c3cc6e2c..73c3a63b61da3b 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -71,6 +71,7 @@
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
+#include <queue>
using namespace llvm;
@@ -510,12 +511,52 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+ std::stack<IntrinsicInst*> DeinterleaveTreeQueue;
+ std::queue<std::pair<unsigned, Value*>> LeafNodes;
+ std::map<IntrinsicInst*, bool>mp;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ DeinterleaveTreeQueue.push(DI);
+ unsigned DILeafCount = 0;
+ while(!DeinterleaveTreeQueue.empty()) {
+ auto CurrentDI = DeinterleaveTreeQueue.top();
+ DeinterleaveTreeQueue.pop();
+ TempDeadInsts.push_back(CurrentDI);
+ bool RootFound = false;
+ for (auto UserExtract : CurrentDI->users()) { // iterate over extract users of deinterleave
+ Instruction *Extract = dyn_cast<Instruction>(UserExtract);
+ if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
+ continue;
+ bool IsLeaf = true;
+ for (auto UserDI : UserExtract->users()) { // iterate over deinterleave users of extract
+ IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
+ if (!Child_DI ||
+ Child_DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+ continue;
+ IsLeaf = false;
+ if (mp.count(Child_DI) == 0) {
+ DeinterleaveTreeQueue.push(Child_DI);
+ }
+ continue;
+ }
+ if (IsLeaf) {
+ RootFound = true;
+ LeafNodes.push(std::make_pair(DILeafCount, UserExtract));
+ TempDeadInsts.push_back(Extract);
+ }
+ else {
+ TempDeadInsts.push_back(Extract);
+ }
+ }
+ if (RootFound)
+ DILeafCount += CurrentDI->getNumUses();
+ }
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.push_back(DI);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(), TempDeadInsts.rend());
DeadInsts.push_back(LI);
return true;
}
@@ -531,14 +572,33 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
-
+ std::queue<IntrinsicInst*> IeinterleaveTreeQueue;
+ std::queue<Value*> LeafNodes;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ IeinterleaveTreeQueue.push(II);
+ while(!IeinterleaveTreeQueue.empty()) {
+ auto node = IeinterleaveTreeQueue.front();
+ TempDeadInsts.push_back(node);
+ IeinterleaveTreeQueue.pop();
+ for(unsigned i = 0; i < 2; i++) {
+ auto op = node->getOperand(i);
+ if(auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
+ if (CurrentII->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+ continue;
+ IeinterleaveTreeQueue.push(CurrentII);
+ continue;
+ }
+ LeafNodes.push(op);
+ }
+ }
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.push_back(II);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
return true;
}
@@ -559,7 +619,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ else if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7947d73f9a4dd0..ab8c01e2df5a9a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16345,15 +16345,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI) const {
+ IntrinsicInst *DI, std::queue<std::pair<unsigned, llvm::Value*>>& LeafNodes, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ const unsigned Factor = std::max(2, (int)LeafNodes.size());
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *VTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front().second->getType()) :
+ cast<VectorType>(DI->getType()->getContainedType(0));
const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16409,8 +16409,27 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateInsertValue(Result, Left, 0);
Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable)
+ if (UseScalable) {
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Result);
+ return true;
+ }
+ while (!LeafNodes.empty()) {
+ unsigned ExtractIndex = LeafNodes.front().first;
+ llvm::Value* CurrentExtract = LeafNodes.front().second;
+ LeafNodes.pop();
+ ExtractValueInst* ExtractValueInst = dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
+
+ SmallVector<unsigned, 4> NewIndices;
+ for (auto index : ExtractValueInst->indices())
+ NewIndices.push_back(index + ExtractIndex);
+
+ Value *extrc =Builder.CreateExtractValue(Result, NewIndices);
+ CurrentExtract->replaceAllUsesWith(extrc);
+ }
+ return true;
+ }
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
@@ -16420,15 +16439,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI) const {
+ IntrinsicInst *II, std::queue<Value*>& LeafNodes, StoreInst *SI) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ // leaf nodes are the nodes that will be interleaved
+ const unsigned Factor = LeafNodes.size();
- VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16473,8 +16492,16 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
}
- if (UseScalable)
- Builder.CreateCall(StNFunc, {L, R, Pred, Address});
+ if (UseScalable) {
+ SmallVector<Value *> Args;
+ while (!LeafNodes.empty()) {
+ Args.push_back(LeafNodes.front());
+ LeafNodes.pop();
+ }
+ Args.push_back(Pred);
+ Args.push_back(Address);
+ Builder.CreateCall(StNFunc, Args);
+ }
else
Builder.CreateCall(StNFunc, {L, R, Address});
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index db6e8a00d2fb5e..85497a1f7ae41a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -683,9 +683,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e80931a03f30b6..35150928f0adb0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3315,15 +3315,17 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
assert(Factor >= 2 && "Invalid interleave factor");
auto *VecVTy = cast<VectorType>(VecTy);
- if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
- return InstructionCost::getInvalid();
+ unsigned MaxFactor = TLI->getMaxSupportedInterleaveFactor();
+ if (VecTy->isScalableTy() &&
+ (!ST->hasSVE() || Factor > MaxFactor))
+ return InstructionCost::getInvalid();
// Vectorization for masked interleaved accesses is only enabled for scalable
// VF.
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
return InstructionCost::getInvalid();
- if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+ if (!UseMaskForGaps && Factor <= MaxFactor) {
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
auto *SubVecTy =
VectorType::get(VecVTy->getElementType(),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index dc7c6f83b98579..64e0a2bb1f2942 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21025,6 +21025,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21033,10 +21034,11 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = std::max(2, (int)LeafNodes.size());
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front().second->getType()) :
+ cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21064,6 +21066,27 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
+ Ops.append({LI->getPointerOperand(), VL});
+ Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
+ //-----------
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Vlseg);
+ return true;
+ }
+ unsigned ExtractIndex = 0;
+ while (!LeafNodes.empty()) {
+ ExtractIndex = LeafNodes.front().first;
+ auto CurrentExtract = LeafNodes.front().second;
+ LeafNodes.pop();
+ ExtractValueInst* ExtractValueInst = dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
+ SmallVector<unsigned, 4> NewIndices;
+ for (auto index : ExtractValueInst->indices()) {
+ NewIndices.push_back(index + ExtractIndex);
+ }
+ Value *extrc = Builder.CreateExtractValue(Vlseg, NewIndices);
+ CurrentExtract->replaceAllUsesWith(extrc);
+ }
+ return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21075,6 +21098,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
}
bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21083,10 +21107,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = LeafNodes.size();
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21112,6 +21136,15 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
+ SmallVector<Value *> Args;
+ while (!LeafNodes.empty()) {
+ Args.push_back(LeafNodes.front());
+ LeafNodes.pop();
+ }
+ Args.push_back(SI->getPointerOperand());
+ Args.push_back(VL);
+ Builder.CreateCall(VssegNFunc, Args);
+ return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index b10da3d40befb7..1f104cf3bc15d5 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -855,10 +855,12 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2057cab46135ff..41f8c5a72ce1e7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -154,6 +154,7 @@
#include <string>
#include <tuple>
#include <utility>
+#include <queue>
using namespace llvm;
@@ -459,10 +460,23 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
- VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
- return Builder.CreateIntrinsic(
- WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
- /*FMFSource=*/nullptr, Name);
+ SmallVector<Value *> Vecs(Vals);
+ unsigned AllNodesNum = (2*Vals.size()) - 1;
+ // last element in the vec should be the final interleaved result,
+ // so, skip processing last element.
+ AllNodesNum --;
+ // interleave each 2 consecutive nodes, and push result to the vec,
+ // so that we can interleave the interleaved results again if we have
+ // more than 2 vectors to interleave.
+ for (unsigned i = 0; i < AllNodesNum; i +=2) {
+ VectorType *VecTy = cast<VectorType>(Vecs[i]->getType());
+ VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
+ auto InterleavedVec = Builder.CreateIntrinsic(
+ WideVecTy, Intrinsic::experimental_vector_interleave2,
+ {Vecs[i], Vecs[i+1]}, /*FMFSource=*/nullptr, Name);
+ Vecs.push_back(InterleavedVec);
+ }
+ return Vecs[Vecs.size()-1];
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2519,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
unsigned Part, Value *MaskForGaps) -> Value * {
if (VF.isScalable()) {
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
- assert(InterleaveFactor == 2 &&
+ assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
auto *BlockInMaskPart = State.get(BlockInMask, Part);
SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
@@ -2572,23 +2586,40 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
if (VecTy->isScalableTy()) {
- assert(InterleaveFactor == 2 &&
- "Unsupported deinterleave factor for scalable vectors");
-
+ assert(isPowerOf2_32(InterleaveFactor) &&
+ "Unsupported deinterleave factor for scalable vectors");
for (unsigned Part = 0; Part < UF; ++Part) {
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
- Value *DI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
- /*FMFSource=*/nullptr, "strided.vec");
+
+ std::queue<Value *>Queue;
+ Queue.push(NewLoads[Part]);
+ // NonLeaf represents how many times we will do deinterleaving,
+ // think of it as a tree, each node will be deinterleaved, until we reach
+ // the leaf nodes which will be the final results of deinterleaving.
+ unsigned NonLeaf = InterleaveFactor - 1;
+ for (unsigned i = 0; i < NonLeaf; i ++) {
+ auto Node = Queue.front();
+ Queue.pop();
+ auto DeinterleaveType = Node->getType();
+ Value *DI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_deinterleave2, DeinterleaveType, Node,
+ /*FMFSource=*/nullptr, "root.strided.vec");
+ Value *StridedVec1 = Builder.CreateExtractValue(DI, 0);
+ Value *Strid...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: Hassnaa Hamdi (hassnaaHamdi) Changes
Patch is 32.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89018.diff 10 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0ade02959025f..e233d430e98dd5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -59,6 +59,8 @@
#include <string>
#include <utility>
#include <vector>
+#include <stack>
+#include <queue>
namespace llvm {
@@ -3145,6 +3147,7 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3156,6 +3159,7 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 438ac1c3cc6e2c..73c3a63b61da3b 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -71,6 +71,7 @@
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
+#include <queue>
using namespace llvm;
@@ -510,12 +511,52 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+ std::stack<IntrinsicInst*> DeinterleaveTreeQueue;
+ std::queue<std::pair<unsigned, Value*>> LeafNodes;
+ std::map<IntrinsicInst*, bool>mp;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ DeinterleaveTreeQueue.push(DI);
+ unsigned DILeafCount = 0;
+ while(!DeinterleaveTreeQueue.empty()) {
+ auto CurrentDI = DeinterleaveTreeQueue.top();
+ DeinterleaveTreeQueue.pop();
+ TempDeadInsts.push_back(CurrentDI);
+ bool RootFound = false;
+ for (auto UserExtract : CurrentDI->users()) { // iterate over extract users of deinterleave
+ Instruction *Extract = dyn_cast<Instruction>(UserExtract);
+ if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
+ continue;
+ bool IsLeaf = true;
+ for (auto UserDI : UserExtract->users()) { // iterate over deinterleave users of extract
+ IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
+ if (!Child_DI ||
+ Child_DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+ continue;
+ IsLeaf = false;
+ if (mp.count(Child_DI) == 0) {
+ DeinterleaveTreeQueue.push(Child_DI);
+ }
+ continue;
+ }
+ if (IsLeaf) {
+ RootFound = true;
+ LeafNodes.push(std::make_pair(DILeafCount, UserExtract));
+ TempDeadInsts.push_back(Extract);
+ }
+ else {
+ TempDeadInsts.push_back(Extract);
+ }
+ }
+ if (RootFound)
+ DILeafCount += CurrentDI->getNumUses();
+ }
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.push_back(DI);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(), TempDeadInsts.rend());
DeadInsts.push_back(LI);
return true;
}
@@ -531,14 +572,33 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
-
+ std::queue<IntrinsicInst*> IeinterleaveTreeQueue;
+ std::queue<Value*> LeafNodes;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ IeinterleaveTreeQueue.push(II);
+ while(!IeinterleaveTreeQueue.empty()) {
+ auto node = IeinterleaveTreeQueue.front();
+ TempDeadInsts.push_back(node);
+ IeinterleaveTreeQueue.pop();
+ for(unsigned i = 0; i < 2; i++) {
+ auto op = node->getOperand(i);
+ if(auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
+ if (CurrentII->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+ continue;
+ IeinterleaveTreeQueue.push(CurrentII);
+ continue;
+ }
+ LeafNodes.push(op);
+ }
+ }
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.push_back(II);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
return true;
}
@@ -559,7 +619,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ else if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7947d73f9a4dd0..ab8c01e2df5a9a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16345,15 +16345,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI) const {
+ IntrinsicInst *DI, std::queue<std::pair<unsigned, llvm::Value*>>& LeafNodes, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ const unsigned Factor = std::max(2, (int)LeafNodes.size());
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *VTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front().second->getType()) :
+ cast<VectorType>(DI->getType()->getContainedType(0));
const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16409,8 +16409,27 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateInsertValue(Result, Left, 0);
Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable)
+ if (UseScalable) {
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Result);
+ return true;
+ }
+ while (!LeafNodes.empty()) {
+ unsigned ExtractIndex = LeafNodes.front().first;
+ llvm::Value* CurrentExtract = LeafNodes.front().second;
+ LeafNodes.pop();
+ ExtractValueInst* ExtractValueInst = dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
+
+ SmallVector<unsigned, 4> NewIndices;
+ for (auto index : ExtractValueInst->indices())
+ NewIndices.push_back(index + ExtractIndex);
+
+ Value *extrc =Builder.CreateExtractValue(Result, NewIndices);
+ CurrentExtract->replaceAllUsesWith(extrc);
+ }
+ return true;
+ }
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
@@ -16420,15 +16439,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI) const {
+ IntrinsicInst *II, std::queue<Value*>& LeafNodes, StoreInst *SI) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ // leaf nodes are the nodes that will be interleaved
+ const unsigned Factor = LeafNodes.size();
- VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16473,8 +16492,16 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
}
- if (UseScalable)
- Builder.CreateCall(StNFunc, {L, R, Pred, Address});
+ if (UseScalable) {
+ SmallVector<Value *> Args;
+ while (!LeafNodes.empty()) {
+ Args.push_back(LeafNodes.front());
+ LeafNodes.pop();
+ }
+ Args.push_back(Pred);
+ Args.push_back(Address);
+ Builder.CreateCall(StNFunc, Args);
+ }
else
Builder.CreateCall(StNFunc, {L, R, Address});
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index db6e8a00d2fb5e..85497a1f7ae41a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -683,9 +683,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e80931a03f30b6..35150928f0adb0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3315,15 +3315,17 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
assert(Factor >= 2 && "Invalid interleave factor");
auto *VecVTy = cast<VectorType>(VecTy);
- if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
- return InstructionCost::getInvalid();
+ unsigned MaxFactor = TLI->getMaxSupportedInterleaveFactor();
+ if (VecTy->isScalableTy() &&
+ (!ST->hasSVE() || Factor > MaxFactor))
+ return InstructionCost::getInvalid();
// Vectorization for masked interleaved accesses is only enabled for scalable
// VF.
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
return InstructionCost::getInvalid();
- if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+ if (!UseMaskForGaps && Factor <= MaxFactor) {
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
auto *SubVecTy =
VectorType::get(VecVTy->getElementType(),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index dc7c6f83b98579..64e0a2bb1f2942 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21025,6 +21025,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21033,10 +21034,11 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = std::max(2, (int)LeafNodes.size());
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front().second->getType()) :
+ cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21064,6 +21066,27 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
+ Ops.append({LI->getPointerOperand(), VL});
+ Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
+ //-----------
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Vlseg);
+ return true;
+ }
+ unsigned ExtractIndex = 0;
+ while (!LeafNodes.empty()) {
+ ExtractIndex = LeafNodes.front().first;
+ auto CurrentExtract = LeafNodes.front().second;
+ LeafNodes.pop();
+ ExtractValueInst* ExtractValueInst = dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
+ SmallVector<unsigned, 4> NewIndices;
+ for (auto index : ExtractValueInst->indices()) {
+ NewIndices.push_back(index + ExtractIndex);
+ }
+ Value *extrc = Builder.CreateExtractValue(Vlseg, NewIndices);
+ CurrentExtract->replaceAllUsesWith(extrc);
+ }
+ return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21075,6 +21098,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
}
bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21083,10 +21107,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = LeafNodes.size();
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21112,6 +21136,15 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
+ SmallVector<Value *> Args;
+ while (!LeafNodes.empty()) {
+ Args.push_back(LeafNodes.front());
+ LeafNodes.pop();
+ }
+ Args.push_back(SI->getPointerOperand());
+ Args.push_back(VL);
+ Builder.CreateCall(VssegNFunc, Args);
+ return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index b10da3d40befb7..1f104cf3bc15d5 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -855,10 +855,12 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2057cab46135ff..41f8c5a72ce1e7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -154,6 +154,7 @@
#include <string>
#include <tuple>
#include <utility>
+#include <queue>
using namespace llvm;
@@ -459,10 +460,23 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
- VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
- return Builder.CreateIntrinsic(
- WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
- /*FMFSource=*/nullptr, Name);
+ SmallVector<Value *> Vecs(Vals);
+ unsigned AllNodesNum = (2*Vals.size()) - 1;
+ // last element in the vec should be the final interleaved result,
+ // so, skip processing last element.
+ AllNodesNum --;
+ // interleave each 2 consecutive nodes, and push result to the vec,
+ // so that we can interleave the interleaved results again if we have
+ // more than 2 vectors to interleave.
+ for (unsigned i = 0; i < AllNodesNum; i +=2) {
+ VectorType *VecTy = cast<VectorType>(Vecs[i]->getType());
+ VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
+ auto InterleavedVec = Builder.CreateIntrinsic(
+ WideVecTy, Intrinsic::experimental_vector_interleave2,
+ {Vecs[i], Vecs[i+1]}, /*FMFSource=*/nullptr, Name);
+ Vecs.push_back(InterleavedVec);
+ }
+ return Vecs[Vecs.size()-1];
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2519,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
unsigned Part, Value *MaskForGaps) -> Value * {
if (VF.isScalable()) {
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
- assert(InterleaveFactor == 2 &&
+ assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
auto *BlockInMaskPart = State.get(BlockInMask, Part);
SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
@@ -2572,23 +2586,40 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
if (VecTy->isScalableTy()) {
- assert(InterleaveFactor == 2 &&
- "Unsupported deinterleave factor for scalable vectors");
-
+ assert(isPowerOf2_32(InterleaveFactor) &&
+ "Unsupported deinterleave factor for scalable vectors");
for (unsigned Part = 0; Part < UF; ++Part) {
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
- Value *DI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
- /*FMFSource=*/nullptr, "strided.vec");
+
+ std::queue<Value *>Queue;
+ Queue.push(NewLoads[Part]);
+ // NonLeaf represents how many times we will do deinterleaving,
+ // think of it as a tree, each node will be deinterleaved, untill we reach to
+ // the leaf nodes which will be the final results of deinterleaving.
+ unsigned NonLeaf = InterleaveFactor - 1;
+ for (unsigned i = 0; i < NonLeaf; i ++) {
+ auto Node = Queue.front();
+ Queue.pop();
+ auto DeinterleaveType = Node->getType();
+ Value *DI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_deinterleave2, DeinterleaveType, Node,
+ /*FMFSource=*/nullptr, "root.strided.vec");
+ Value *StridedVec1 = Builder.CreateExtractValue(DI, 0);
+ Value *Strid...
[truncated]
|
You can test this locally with the following command:git-clang-format --diff 8e0a4a89f940d17b520bbca040981f54195d3ea4 4629ab0d5b252d03d090d541179350a596048460 -- llvm/include/llvm/CodeGen/TargetLowering.h llvm/lib/CodeGen/InterleavedAccessPass.cpp llvm/lib/Target/AArch64/AArch64ISelLowering.cpp llvm/lib/Target/AArch64/AArch64ISelLowering.h llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp llvm/lib/Target/RISCV/RISCVISelLowering.cpp llvm/lib/Target/RISCV/RISCVISelLowering.h llvm/lib/Transforms/Vectorize/LoopVectorize.cpp View the diff from clang-format here.diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e233d430e9..7d3e97a9f7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -56,11 +56,11 @@
#include <cstdint>
#include <iterator>
#include <map>
+#include <queue>
+#include <stack>
#include <string>
#include <utility>
#include <vector>
-#include <stack>
-#include <queue>
namespace llvm {
@@ -3146,9 +3146,9 @@ public:
///
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
- virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- std::queue<std::pair<unsigned, Value*>>& LeafNodes,
- LoadInst *LI) const {
+ virtual bool lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, std::queue<std::pair<unsigned, Value *>> &LeafNodes,
+ LoadInst *LI) const {
return false;
}
@@ -3159,7 +3159,7 @@ public:
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- std::queue<Value*>& LeafNodes,
+ std::queue<Value *> &LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 73c3a63b61..e4e53b9b66 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -70,8 +70,8 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
-#include <utility>
#include <queue>
+#include <utility>
using namespace llvm;
@@ -511,28 +511,30 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
- std::stack<IntrinsicInst*> DeinterleaveTreeQueue;
- std::queue<std::pair<unsigned, Value*>> LeafNodes;
- std::map<IntrinsicInst*, bool>mp;
+ std::stack<IntrinsicInst *> DeinterleaveTreeQueue;
+ std::queue<std::pair<unsigned, Value *>> LeafNodes;
+ std::map<IntrinsicInst *, bool> mp;
SmallVector<Instruction *> TempDeadInsts;
DeinterleaveTreeQueue.push(DI);
unsigned DILeafCount = 0;
- while(!DeinterleaveTreeQueue.empty()) {
+ while (!DeinterleaveTreeQueue.empty()) {
auto CurrentDI = DeinterleaveTreeQueue.top();
DeinterleaveTreeQueue.pop();
TempDeadInsts.push_back(CurrentDI);
bool RootFound = false;
- for (auto UserExtract : CurrentDI->users()) { // iterate over extract users of deinterleave
+ for (auto UserExtract :
+ CurrentDI->users()) { // iterate over extract users of deinterleave
Instruction *Extract = dyn_cast<Instruction>(UserExtract);
if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
continue;
bool IsLeaf = true;
- for (auto UserDI : UserExtract->users()) { // iterate over deinterleave users of extract
+ for (auto UserDI :
+ UserExtract->users()) { // iterate over deinterleave users of extract
IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
- if (!Child_DI ||
- Child_DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
- continue;
+ if (!Child_DI || Child_DI->getIntrinsicID() !=
+ Intrinsic::experimental_vector_deinterleave2)
+ continue;
IsLeaf = false;
if (mp.count(Child_DI) == 0) {
DeinterleaveTreeQueue.push(Child_DI);
@@ -543,8 +545,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
RootFound = true;
LeafNodes.push(std::make_pair(DILeafCount, UserExtract));
TempDeadInsts.push_back(Extract);
- }
- else {
+ } else {
TempDeadInsts.push_back(Extract);
}
}
@@ -556,7 +557,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(), TempDeadInsts.rend());
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(),
+ TempDeadInsts.rend());
DeadInsts.push_back(LI);
return true;
}
@@ -572,20 +574,21 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
- std::queue<IntrinsicInst*> IeinterleaveTreeQueue;
- std::queue<Value*> LeafNodes;
+ std::queue<IntrinsicInst *> IeinterleaveTreeQueue;
+ std::queue<Value *> LeafNodes;
SmallVector<Instruction *> TempDeadInsts;
IeinterleaveTreeQueue.push(II);
- while(!IeinterleaveTreeQueue.empty()) {
+ while (!IeinterleaveTreeQueue.empty()) {
auto node = IeinterleaveTreeQueue.front();
TempDeadInsts.push_back(node);
IeinterleaveTreeQueue.pop();
- for(unsigned i = 0; i < 2; i++) {
+ for (unsigned i = 0; i < 2; i++) {
auto op = node->getOperand(i);
- if(auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
- if (CurrentII->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
- continue;
+ if (auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
+ if (CurrentII->getIntrinsicID() !=
+ Intrinsic::experimental_vector_interleave2)
+ continue;
IeinterleaveTreeQueue.push(CurrentII);
continue;
}
@@ -619,7 +622,8 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- else if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ else if (II->getIntrinsicID() ==
+ Intrinsic::experimental_vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ab8c01e2df..be9b72f4b4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16345,15 +16345,18 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, std::queue<std::pair<unsigned, llvm::Value*>>& LeafNodes, LoadInst *LI) const {
+ IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, llvm::Value *>> &LeafNodes,
+ LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
const unsigned Factor = std::max(2, (int)LeafNodes.size());
- VectorType *VTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front().second->getType()) :
- cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *VTy = (LeafNodes.size() > 0)
+ ? cast<VectorType>(LeafNodes.front().second->getType())
+ : cast<VectorType>(DI->getType()->getContainedType(0));
const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16417,20 +16420,20 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
while (!LeafNodes.empty()) {
unsigned ExtractIndex = LeafNodes.front().first;
- llvm::Value* CurrentExtract = LeafNodes.front().second;
+ llvm::Value *CurrentExtract = LeafNodes.front().second;
LeafNodes.pop();
- ExtractValueInst* ExtractValueInst = dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
-
+ ExtractValueInst *ExtractValueInst =
+ dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
+
SmallVector<unsigned, 4> NewIndices;
for (auto index : ExtractValueInst->indices())
NewIndices.push_back(index + ExtractIndex);
- Value *extrc =Builder.CreateExtractValue(Result, NewIndices);
+ Value *extrc = Builder.CreateExtractValue(Result, NewIndices);
CurrentExtract->replaceAllUsesWith(extrc);
}
return true;
- }
- else
+ } else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
@@ -16439,7 +16442,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, std::queue<Value*>& LeafNodes, StoreInst *SI) const {
+ IntrinsicInst *II, std::queue<Value *> &LeafNodes, StoreInst *SI) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
@@ -16501,8 +16504,7 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Args.push_back(Pred);
Args.push_back(Address);
Builder.CreateCall(StNFunc, Args);
- }
- else
+ } else
Builder.CreateCall(StNFunc, {L, R, Address});
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 85497a1f7a..d114f462d6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -682,12 +682,12 @@ public:
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- std::queue<std::pair<unsigned, Value*>>& LeafNodes,
- LoadInst *LI) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, std::queue<std::pair<unsigned, Value *>> &LeafNodes,
+ LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- std::queue<Value*>& LeafNodes,
+ std::queue<Value *> &LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 35150928f0..51fe96b5cf 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3315,10 +3315,9 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
assert(Factor >= 2 && "Invalid interleave factor");
auto *VecVTy = cast<VectorType>(VecTy);
- unsigned MaxFactor = TLI->getMaxSupportedInterleaveFactor();
- if (VecTy->isScalableTy() &&
- (!ST->hasSVE() || Factor > MaxFactor))
- return InstructionCost::getInvalid();
+ unsigned MaxFactor = TLI->getMaxSupportedInterleaveFactor();
+ if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor > MaxFactor))
+ return InstructionCost::getInvalid();
// Vectorization for masked interleaved accesses is only enabled for scalable
// VF.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 64e0a2bb1f..f98fbc581c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21024,9 +21024,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- std::queue<std::pair<unsigned, Value*>>& LeafNodes,
- LoadInst *LI) const {
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, std::queue<std::pair<unsigned, Value *>> &LeafNodes,
+ LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21037,8 +21037,10 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
unsigned Factor = std::max(2, (int)LeafNodes.size());
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front().second->getType()) :
- cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy =
+ (LeafNodes.size() > 0)
+ ? cast<VectorType>(LeafNodes.front().second->getType())
+ : cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21078,7 +21080,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
ExtractIndex = LeafNodes.front().first;
auto CurrentExtract = LeafNodes.front().second;
LeafNodes.pop();
- ExtractValueInst* ExtractValueInst = dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
+ ExtractValueInst *ExtractValueInst =
+ dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
SmallVector<unsigned, 4> NewIndices;
for (auto index : ExtractValueInst->indices()) {
NewIndices.push_back(index + ExtractIndex);
@@ -21097,9 +21100,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
return true;
}
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- std::queue<Value*>& LeafNodes,
- StoreInst *SI) const {
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, std::queue<Value *> &LeafNodes, StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21137,14 +21139,14 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
SmallVector<Value *> Args;
- while (!LeafNodes.empty()) {
- Args.push_back(LeafNodes.front());
- LeafNodes.pop();
- }
- Args.push_back(SI->getPointerOperand());
- Args.push_back(VL);
- Builder.CreateCall(VssegNFunc, Args);
- return true;
+ while (!LeafNodes.empty()) {
+ Args.push_back(LeafNodes.front());
+ LeafNodes.pop();
+ }
+ Args.push_back(SI->getPointerOperand());
+ Args.push_back(VL);
+ Builder.CreateCall(VssegNFunc, Args);
+ return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 1f104cf3bc..3c16dcd9ae 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -855,12 +855,12 @@ public:
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- std::queue<std::pair<unsigned, Value*>>& LeafNodes,
- LoadInst *LI) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, std::queue<std::pair<unsigned, Value *>> &LeafNodes,
+ LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- std::queue<Value*>& LeafNodes,
+ std::queue<Value *> &LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 41f8c5a72c..7a2c7e3f8f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -151,10 +151,10 @@
#include <limits>
#include <map>
#include <memory>
+#include <queue>
#include <string>
#include <tuple>
#include <utility>
-#include <queue>
using namespace llvm;
@@ -461,22 +461,22 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
SmallVector<Value *> Vecs(Vals);
- unsigned AllNodesNum = (2*Vals.size()) - 1;
+ unsigned AllNodesNum = (2 * Vals.size()) - 1;
// last element in the vec should be the final interleaved result,
// so, skip processing last element.
- AllNodesNum --;
+ AllNodesNum--;
// interleave each 2 consecutive nodes, and push result to the vec,
// so that we can interleave the interleaved results again if we have
// more than 2 vectors to interleave.
- for (unsigned i = 0; i < AllNodesNum; i +=2) {
+ for (unsigned i = 0; i < AllNodesNum; i += 2) {
VectorType *VecTy = cast<VectorType>(Vecs[i]->getType());
VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
auto InterleavedVec = Builder.CreateIntrinsic(
- WideVecTy, Intrinsic::experimental_vector_interleave2,
- {Vecs[i], Vecs[i+1]}, /*FMFSource=*/nullptr, Name);
+ WideVecTy, Intrinsic::experimental_vector_interleave2,
+ {Vecs[i], Vecs[i + 1]}, /*FMFSource=*/nullptr, Name);
Vecs.push_back(InterleavedVec);
}
- return Vecs[Vecs.size()-1];
+ return Vecs[Vecs.size() - 1];
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2533,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
unsigned Part, Value *MaskForGaps) -> Value * {
if (VF.isScalable()) {
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
- assert(isPowerOf2_32(InterleaveFactor) &&
+ assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
auto *BlockInMaskPart = State.get(BlockInMask, Part);
SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
@@ -2586,25 +2586,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
if (VecTy->isScalableTy()) {
- assert(isPowerOf2_32(InterleaveFactor) &&
- "Unsupported deinterleave factor for scalable vectors");
+ assert(isPowerOf2_32(InterleaveFactor) &&
+ "Unsupported deinterleave factor for scalable vectors");
for (unsigned Part = 0; Part < UF; ++Part) {
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
-
- std::queue<Value *>Queue;
+
+ std::queue<Value *> Queue;
Queue.push(NewLoads[Part]);
// NonLeaf represents how many times we will do deinterleaving,
- // think of it as a tree, each node will be deinterleaved, untill we reach to
- // the leaf nodes which will be the final results of deinterleaving.
+ // think of it as a tree, each node will be deinterleaved, untill we
+ // reach to the leaf nodes which will be the final results of
+ // deinterleaving.
unsigned NonLeaf = InterleaveFactor - 1;
- for (unsigned i = 0; i < NonLeaf; i ++) {
+ for (unsigned i = 0; i < NonLeaf; i++) {
auto Node = Queue.front();
Queue.pop();
auto DeinterleaveType = Node->getType();
Value *DI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_deinterleave2, DeinterleaveType, Node,
- /*FMFSource=*/nullptr, "root.strided.vec");
+ Intrinsic::experimental_vector_deinterleave2, DeinterleaveType,
+ Node,
+ /*FMFSource=*/nullptr, "root.strided.vec");
Value *StridedVec1 = Builder.CreateExtractValue(DI, 0);
Value *StridedVec2 = Builder.CreateExtractValue(DI, 1);
Queue.push(StridedVec1);
@@ -2712,7 +2714,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
// Interleave all the smaller vectors into one wider vector.
Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
- //LLVM_DEBUG(dbgs() << "interleaved vec: "; IVec->dump());
+ // LLVM_DEBUG(dbgs() << "interleaved vec: "; IVec->dump());
Instruction *NewStoreInstr;
if (BlockInMask || MaskForGaps) {
Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
@@ -8691,8 +8693,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
CM.getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
// For scalable vectors, the only interleave factor currently supported
- // is a (power of 2) factor, since we require the (de)interleave2 intrinsics instead of
- // shufflevectors, so we can do (de)interleave2 recursively.
+ // is a (power of 2) factor, since we require the (de)interleave2
+ // intrinsics instead of shufflevectors, so we can do (de)interleave2
+ // recursively.
assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
"Unsupported interleave factor for scalable vectors");
return Result;
|
It looks like the TargetLowering, LV and InterleavedAccessPass changes could be decoupled? |
@@ -2519,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( | |||
unsigned Part, Value *MaskForGaps) -> Value * { | |||
if (VF.isScalable()) { | |||
assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); | |||
assert(InterleaveFactor == 2 && | |||
assert(isPowerOf2_32(InterleaveFactor) && | |||
"Unsupported deinterleave factor for scalable vectors"); | |||
auto *BlockInMaskPart = State.get(BlockInMask, Part); | |||
SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The mask of masked interleaved accesses also requires an interleave tree to generate the correct mask.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you please give an example of a case that uses masked interleaved accesses?
I have commented out the code that creates the masked load (the call to the CreateGroupMask lambda function) and reran the tests, but all tests ran successfully. It seems that for the interleaved accesses all the loads are aligned, not masked.
// think of it as a tree, each node will be deinterleaved, untill we reach to | ||
// the leaf nodes which will be the final results of deinterleaving. | ||
unsigned NonLeaf = InterleaveFactor - 1; | ||
for (unsigned i = 0; i < NonLeaf; i ++) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i
--> I
i ++
--> I++
auto StridedVec = Queue.front(); | ||
Queue.pop(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here is an example:
A vector 0 1 2 3 4 5 6 7
If we do deinterleave 4 on the vector, we should get:
member 0: 0 4
member 1: 1 5
member 2: 2 6
member 3: 3 7
But the Queue in your change may look like: 0 4, 2 6, 1 5, 3 7.
Please confirm the Queue is sorted by a correct rank.
@@ -2681,6 +2712,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( | |||
|
|||
// Interleave all the smaller vectors into one wider vector. | |||
Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); | |||
//LLVM_DEBUG(dbgs() << "interleaved vec: "; IVec->dump()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove it.
I agree. Please can you split this PR into two PRs with the first only concerned with the correct lowering of the new emulated ld4/st4 IR sequence. The second PR then teaches LoopVectorize how to use/generate them. |
Hi - Is there a plan for how to handle ld3? We have seen a lot of issues recently with the canonical shuffle representation for fixed-vector ld2/ld3/ld4, and I was wondering if it made sense to move away from shuffles for fixed-length too. |
There is, but that will require a new intrinsic. My hope is that rather than having an intrinsic per interleave factor, we could model them all using interleave2 and interleave3 (once it's created). This is why we've started with ld4/st4 support: to see if there are any pitfalls to this approach. Personally, I'd love us to move to using these intrinsics for all vector types because it will streamline several code paths. |
Doing deinterleaving as trees sort of makes sense for high interleaving factors... I've seen loops that benefit from deinterleaving with interleave factors as high as 12. I'm a little concerned the abstraction layers here are going to make cost modeling less accurate, though; ideally, the vectorizer should be able to estimate the cost of an ld4. |
RISC-V has interleave loads for up to 8. So I guess we would need interleave5 and interleave7? |
Yes, sorry. I guess I meant "Hopefully we can emulate all required interleave factors by only implementing specific intrinsics for factors that are a prime number"? An alternative proposal is to have intrinsics for all but then lower them to sequences of fewer intrinsics within the InterleavedAccess pass or perhaps even SelectionDAGBuilder. I suppose this really depends on how awkward cost modelling the sequences turns out to be. @efriedma-quic - Is your concern related to vectorisation or the costing of already vectorised code? |
Given the way the pass pipeline is structured, cost modeling in the vectorizer itself tends to be more important than modeling in subsequent passes. I guess maybe it's not a big deal what the vectorizer generates if the vectorizer itself has some way to get the correct numbers. |
The loop vectorizer will produce costs via getInterleavedMemoryOpCost so should be fine as far as I understand. If there are no combines later on (either uncosted in instcombine or costed in vector-combine) that work with vector.interleave/vector.deinterleave then they can break the canonical patterns that the backend is expecting to generate ld2/ld4 from. I'm hoping that if we can move to interleave/deinterleave, that should fix some of the problems we have at the moment. I have recently been adding costs for the existing shuffles we find for fixed length vectors, in an attempt to reduce the number of times we break apart the load+shuffle (or store+shuffle), and have to either attempt to repair it or fall back to worse generation in the backend. I would say that in general costing for single-instructions is fine, two instructions making a pattern (like shuffle(load) or store(shuffle)) are do-able but start to get unreliable, and three-instruction plus becomes difficult to cost well. |
The LoopVectorize can't use scalable vectors to vectorize it,
because scalable vectors have to use intrinsics to deinterleave,
BUT (de)interleave4 is not available.
then the vectorizer could use scalable vectors.
it will be deinterleaved into { <vscale x 8 x i32>, <vscale x 8 x i32> },
then each extracted vector <vscale x 8 x i32> will be deinterleaved into { <vscale x 4 x i32>, <vscale x 4 x i32> },
so the final result would be: { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> },
which is the same result as if we could use deinterleave4.