base: users/skachkov-sc/widen-decision-refactor
[VPlan] Implement compressed widening of memory instructions #166956
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-vectorizers

Author: Sergey Kachkov (skachkov-sc)

Changes

RFC link: https://discourse.llvm.org/t/rfc-loop-vectorization-of-compress-store-expand-load-patterns/86442

Full diff: https://github.com/llvm/llvm-project/pull/166956.diff

5 Files Affected:
- llvm/include/llvm/Analysis/TargetTransformInfo.h
- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
- llvm/lib/Transforms/Vectorize/VPlan.h
- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0f17312b03827..e8769f5860c77 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1442,6 +1442,7 @@ class TargetTransformInfo {
Normal, ///< The cast is used with a normal load/store.
Masked, ///< The cast is used with a masked load/store.
GatherScatter, ///< The cast is used with a gather/scatter.
+ Compressed, ///< The cast is used with an expand load/compress store.
Interleave, ///< The cast is used with an interleaved load/store.
Reversed, ///< The cast is used with a reversed load/store.
};
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 914018591d832..25e8a63eae9cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1027,6 +1027,7 @@ class LoopVectorizationCostModel {
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
+ CM_Compressed,
CM_Scalarize,
CM_VectorCall,
CM_IntrinsicCall
@@ -3108,9 +3109,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (IsUniformMemOpUse(I))
return true;
- return (WideningDecision == CM_Widen ||
- WideningDecision == CM_Widen_Reverse ||
- WideningDecision == CM_Interleave);
+ return (
+ WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_Interleave || WideningDecision == CM_Compressed);
};
// Returns true if Ptr is the pointer operand of a memory access instruction
@@ -5191,12 +5192,17 @@ InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost(
Instruction *I, ElementCount VF, InstWidening Decision) {
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ if (Decision == CM_Compressed)
+ return TTI.getExpandCompressMemoryOpCost(I->getOpcode(), VectorTy,
+ /*VariableMask*/ true, Alignment,
+ CostKind, I);
+
assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) &&
"Expected widen decision.");
- const Align Alignment = getLoadStoreAlignment(I);
InstructionCost Cost = 0;
if (Legal->isMaskRequired(I)) {
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
@@ -6299,6 +6305,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
switch (getWideningDecision(I, VF)) {
case LoopVectorizationCostModel::CM_GatherScatter:
return TTI::CastContextHint::GatherScatter;
+ case LoopVectorizationCostModel::CM_Compressed:
+ return TTI::CastContextHint::Compressed;
case LoopVectorizationCostModel::CM_Interleave:
return TTI::CastContextHint::Interleave;
case LoopVectorizationCostModel::CM_Scalarize:
@@ -7514,8 +7522,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, Range.Start);
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
+ bool Compressed = Decision == LoopVectorizationCostModel::CM_Compressed;
bool Consecutive =
- Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+ Reverse || Compressed || Decision == LoopVectorizationCostModel::CM_Widen;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
@@ -7545,11 +7554,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- VPIRMetadata(*Load, LVer), I->getDebugLoc());
+ Compressed, VPIRMetadata(*Load, LVer),
+ I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, VPIRMetadata(*Store, LVer),
+ Reverse, Compressed, VPIRMetadata(*Store, LVer),
I->getDebugLoc());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bbb03fbdff7a2..26256951a9c6c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3193,6 +3193,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
/// Whether the consecutive accessed addresses are in reverse order.
bool Reverse;
+ /// Whether the consecutive accessed addresses are compressed with mask value.
+ bool Compressed;
+
/// Whether the memory access is masked.
bool IsMasked = false;
@@ -3206,12 +3209,13 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse,
+ bool Consecutive, bool Reverse, bool Compressed,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
- Reverse(Reverse) {
+ Reverse(Reverse), Compressed(Compressed) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+ assert((Consecutive || !Compressed) && "Compressed implies consecutive");
assert(isa<VPVectorEndPointerRecipe>(getAddr()) ||
!Reverse &&
"Reversed acccess without VPVectorEndPointerRecipe address?");
@@ -3241,6 +3245,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
/// order.
bool isReverse() const { return Reverse; }
+ /// Return whether the consecutive loaded/stored addresses are compressed.
+ bool isCompressed() const { return Compressed; }
+
/// Return the address accessed by this recipe.
VPValue *getAddr() const { return getOperand(0); }
@@ -3274,18 +3281,18 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse,
+ bool Consecutive, bool Reverse, bool Compressed,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, Metadata, DL),
+ Reverse, Compressed, Metadata, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse, *this,
- getDebugLoc());
+ getMask(), Consecutive, Reverse, Compressed,
+ *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3316,8 +3323,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
- {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
- L.getDebugLoc()),
+ {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
+ L.isCompressed(), L, L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -3355,16 +3362,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
VPValue *Mask, bool Consecutive, bool Reverse,
- const VPIRMetadata &Metadata, DebugLoc DL)
+ bool Compressed, const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, Metadata, DL) {
+ Consecutive, Reverse, Compressed, Metadata, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, *this, getDebugLoc());
+ Reverse, Compressed, *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3399,7 +3406,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
- S.isReverse(), S, S.getDebugLoc()) {
+ S.isReverse(), S.isCompressed(), S,
+ S.getDebugLoc()) {
setMask(Mask);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 80cd112dbcd8a..0b0bd63ee2b28 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3565,8 +3565,12 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
InstructionCost Cost = 0;
if (IsMasked) {
- Cost +=
- Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind);
+ Cost += Compressed
+ ? Ctx.TTI.getExpandCompressMemoryOpCost(Opcode, Ty,
+ /*VariableMask*/ true,
+ Alignment, Ctx.CostKind)
+ : Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS,
+ Ctx.CostKind);
} else {
TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
@@ -3603,9 +3607,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
"wide.masked.gather");
} else if (Mask) {
- NewLI =
- Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
- PoisonValue::get(DataTy), "wide.masked.load");
+ NewLI = Compressed
+ ? Builder.CreateMaskedExpandLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.expand.load")
+ : Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.load");
} else {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
@@ -3732,7 +3740,10 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
if (CreateScatter)
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
else if (Mask)
- NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+ NewSI = Compressed
+ ? Builder.CreateMaskedCompressStore(StoredVal, Addr, Alignment,
+ Mask)
+ : Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
else
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
applyMetadata(*NewSI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 48bd697397f41..cdfbc531ebfa6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -91,13 +91,14 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
- Ingredient.getDebugLoc());
+ false /*Consecutive*/, false /*Reverse*/, false /*Compressed*/,
+ VPIRMetadata(*Load), Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- VPIRMetadata(*Store), Ingredient.getDebugLoc());
+ false /*Compressed*/, VPIRMetadata(*Store),
+ Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -4207,7 +4208,7 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
auto *L = new VPWidenLoadRecipe(
*LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
- /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
+ /*Reverse=*/false, /*Compressed=*/false, {}, LoadGroup->getDebugLoc());
L->insertBefore(LoadGroup);
NarrowedOps.insert(L);
return L;
@@ -4344,7 +4345,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
auto *S = new VPWidenStoreRecipe(
*SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
- /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
+ /*Reverse=*/false, /*Compressed=*/false, {}, StoreGroup->getDebugLoc());
S->insertBefore(StoreGroup);
StoreGroup->eraseFromParent();
}
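For readers less familiar with the IRBuilder entry points the recipes now use, here is a minimal standalone sketch. The wrapper function names are hypothetical; the builder calls and intrinsic semantics follow the execute() changes in the diff above.

```c++
// Minimal sketch, not part of the patch: the helper names below are made up,
// but the IRBuilder calls mirror the new Compressed branches in
// VPWidenLoadRecipe::execute and VPWidenStoreRecipe::execute.
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Emits @llvm.masked.expandload: loads the active elements from consecutive
// memory starting at Addr and expands them into the enabled lanes of the
// result vector; disabled lanes receive the pass-through value (poison here).
Value *emitCompressedLoad(IRBuilderBase &Builder, Type *DataTy, Value *Addr,
                          Align Alignment, Value *Mask) {
  return Builder.CreateMaskedExpandLoad(DataTy, Addr, Alignment, Mask,
                                        PoisonValue::get(DataTy),
                                        "wide.masked.expand.load");
}

// Emits @llvm.masked.compressstore: packs the enabled lanes of StoredVal and
// stores them to consecutive memory starting at Addr.
Instruction *emitCompressedStore(IRBuilderBase &Builder, Value *StoredVal,
                                 Value *Addr, Align Alignment, Value *Mask) {
  return Builder.CreateMaskedCompressStore(StoredVal, Addr, Alignment, Mask);
}
```

The cost of these operations is queried through TTI.getExpandCompressMemoryOpCost, which the cost-model changes above now call for compressed accesses.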
RFC link: https://discourse.llvm.org/t/rfc-loop-vectorization-of-compress-store-expand-load-patterns/86442
This patch extends VPWidenMemoryRecipe with a "Compressed" property, which makes it generate expand loads/compress stores. "Compressed" here implies that the access is both masked and consecutive (the mask selects which vector lanes correspond to the elements loaded from or stored to consecutive memory locations). The overall diagram of generating widened memory operations will look like this:

This patch omits support for expand loads/compress stores in the legacy (non-VPlan-based) cost model; that will be required at some point, since the LoopVectorizer checks that the VPlan-based and legacy cost models make the same vectorization decisions.
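For illustration, a hedged example of the kind of source pattern the RFC targets (the function and variable names below are made up, not taken from the patch or its tests): a conditional store through a pointer that advances only when the condition holds, so the stored addresses stay consecutive and the loop-varying condition becomes the compress mask.

```c++
// Illustrative only: a compress-store pattern as described in the RFC.
// Each iteration that passes the predicate appends src[i] to dst, so the
// stores land at consecutive addresses; with this patch series such a store
// can be widened into an @llvm.masked.compressstore per vector iteration
// instead of being scalarized or turned into a scatter.
int compressPositive(const int *src, int *dst, int n) {
  int k = 0;
  for (int i = 0; i < n; ++i)
    if (src[i] > 0)
      dst[k++] = src[i];
  return k;
}
```

The symmetric expand-load case reads through a pointer that advances under the same kind of condition and maps to @llvm.masked.expandload.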