From ed75372a76eb173eb49370ffa10e4f820d8ee778 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov
Date: Fri, 7 Nov 2025 18:09:56 +0300
Subject: [PATCH] [VPlan] Implement compressed widening of memory instructions

---
 .../llvm/Analysis/TargetTransformInfo.h       |  1 +
 .../Transforms/Vectorize/LoopVectorize.cpp    | 24 ++++++++++----
 llvm/lib/Transforms/Vectorize/VPlan.h         | 32 ++++++++++++-------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 23 +++++++++----
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 11 ++++---
 5 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0f17312b03827..e8769f5860c77 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1442,6 +1442,7 @@ class TargetTransformInfo {
     Normal,        ///< The cast is used with a normal load/store.
     Masked,        ///< The cast is used with a masked load/store.
     GatherScatter, ///< The cast is used with a gather/scatter.
+    Compressed,    ///< The cast is used with an expand load/compress store.
     Interleave,    ///< The cast is used with an interleaved load/store.
     Reversed,      ///< The cast is used with a reversed load/store.
   };
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e069b2e8103e0..6565c8c036ca0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1027,6 +1027,7 @@ class LoopVectorizationCostModel {
     CM_Widen_Reverse, // For consecutive accesses with stride -1.
     CM_Interleave,
     CM_GatherScatter,
+    CM_Compressed,
     CM_Scalarize,
     CM_VectorCall,
     CM_IntrinsicCall
@@ -3109,9 +3110,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     if (IsUniformMemOpUse(I))
       return true;
 
-    return (WideningDecision == CM_Widen ||
-            WideningDecision == CM_Widen_Reverse ||
-            WideningDecision == CM_Interleave);
+    return (
+        WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+        WideningDecision == CM_Interleave || WideningDecision == CM_Compressed);
   };
 
   // Returns true if Ptr is the pointer operand of a memory access instruction
@@ -5192,11 +5193,16 @@ InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost(
     Instruction *I, ElementCount VF, InstWidening Decision) {
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  const Align Alignment = getLoadStoreAlignment(I);
   unsigned AS = getLoadStoreAddressSpace(I);
 
+  if (Decision == CM_Compressed)
+    return TTI.getExpandCompressMemoryOpCost(I->getOpcode(), VectorTy,
+                                             /*VariableMask=*/true, Alignment,
+                                             CostKind, I);
+
   assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) &&
          "Expected widen decision.");
-  const Align Alignment = getLoadStoreAlignment(I);
   InstructionCost Cost = 0;
   if (Legal->isMaskRequired(I)) {
     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
@@ -6300,6 +6306,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     switch (getWideningDecision(I, VF)) {
     case LoopVectorizationCostModel::CM_GatherScatter:
       return TTI::CastContextHint::GatherScatter;
+    case LoopVectorizationCostModel::CM_Compressed:
+      return TTI::CastContextHint::Compressed;
     case LoopVectorizationCostModel::CM_Interleave:
       return TTI::CastContextHint::Interleave;
     case LoopVectorizationCostModel::CM_Scalarize:
@@ -7515,8 +7523,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   LoopVectorizationCostModel::InstWidening Decision =
       CM.getWideningDecision(I, Range.Start);
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
+  bool Compressed = Decision == LoopVectorizationCostModel::CM_Compressed;
   bool Consecutive =
-      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+      Reverse || Compressed || Decision == LoopVectorizationCostModel::CM_Widen;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
   if (Consecutive) {
@@ -7546,11 +7555,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
-                                 VPIRMetadata(*Load, LVer), I->getDebugLoc());
+                                 Compressed, VPIRMetadata(*Load, LVer),
+                                 I->getDebugLoc());
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
-                                Reverse, VPIRMetadata(*Store, LVer),
+                                Reverse, Compressed, VPIRMetadata(*Store, LVer),
                                 I->getDebugLoc());
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bbb03fbdff7a2..26256951a9c6c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3193,6 +3193,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
   /// Whether the consecutive accessed addresses are in reverse order.
   bool Reverse;
 
+  /// Whether the consecutive accessed addresses are compressed with the mask.
+  bool Compressed;
+
   /// Whether the memory access is masked.
   bool IsMasked = false;
 
@@ -3206,12 +3209,13 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
 
   VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
                       std::initializer_list<VPValue *> Operands,
-                      bool Consecutive, bool Reverse,
+                      bool Consecutive, bool Reverse, bool Compressed,
                       const VPIRMetadata &Metadata, DebugLoc DL)
       : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
         Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
-        Reverse(Reverse) {
+        Reverse(Reverse), Compressed(Compressed) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+    assert((Consecutive || !Compressed) && "Compressed implies consecutive");
     assert(isa<VPVectorEndPointerRecipe>(getAddr()) ||
            !Reverse &&
                "Reversed access without VPVectorEndPointerRecipe address?");
@@ -3241,6 +3245,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
   /// order.
   bool isReverse() const { return Reverse; }
 
+  /// Return whether the consecutive loaded/stored addresses are compressed.
+  bool isCompressed() const { return Compressed; }
+
   /// Return the address accessed by this recipe.
   VPValue *getAddr() const { return getOperand(0); }
 
@@ -3274,18 +3281,18 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
 struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
                                                    public VPValue {
   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
-                    bool Consecutive, bool Reverse,
+                    bool Consecutive, bool Reverse, bool Compressed,
                     const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
-                            Reverse, Metadata, DL),
+                            Reverse, Compressed, Metadata, DL),
         VPValue(this, &Load) {
     setMask(Mask);
   }
 
   VPWidenLoadRecipe *clone() override {
     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
-                                 getMask(), Consecutive, Reverse, *this,
-                                 getDebugLoc());
+                                 getMask(), Consecutive, Reverse, Compressed,
+                                 *this, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3316,8 +3323,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
-                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
-                            L.getDebugLoc()),
+                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
+                            L.isCompressed(), L, L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -3355,16 +3362,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
 struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
   VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
                      VPValue *Mask, bool Consecutive, bool Reverse,
-                     const VPIRMetadata &Metadata, DebugLoc DL)
+                     bool Compressed, const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
-                            Consecutive, Reverse, Metadata, DL) {
+                            Consecutive, Reverse, Compressed, Metadata, DL) {
     setMask(Mask);
   }
 
   VPWidenStoreRecipe *clone() override {
     return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
                                   getStoredValue(), getMask(), Consecutive,
-                                  Reverse, *this, getDebugLoc());
+                                  Reverse, Compressed, *this, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3399,7 +3406,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
                             {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
-                            S.isReverse(), S, S.getDebugLoc()) {
+                            S.isReverse(), S.isCompressed(), S,
+                            S.getDebugLoc()) {
     setMask(Mask);
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 80cd112dbcd8a..0b0bd63ee2b28 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3565,8 +3565,12 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
 
   InstructionCost Cost = 0;
   if (IsMasked) {
-    Cost +=
-        Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind);
+    Cost += Compressed
+                ? Ctx.TTI.getExpandCompressMemoryOpCost(Opcode, Ty,
+                                                        /*VariableMask=*/true,
+                                                        Alignment, Ctx.CostKind)
+                : Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS,
+                                                Ctx.CostKind);
   } else {
     TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
         isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
@@ -3603,9 +3607,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
     NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
                                        "wide.masked.gather");
   } else if (Mask) {
-    NewLI =
-        Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
-                                 PoisonValue::get(DataTy), "wide.masked.load");
+    NewLI = Compressed
+                ? Builder.CreateMaskedExpandLoad(DataTy, Addr, Alignment, Mask,
+                                                 PoisonValue::get(DataTy),
+                                                 "wide.masked.expand.load")
+                : Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+                                           PoisonValue::get(DataTy),
+                                           "wide.masked.load");
   } else {
     NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
   }
@@ -3732,7 +3740,10 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
   if (CreateScatter)
     NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
   else if (Mask)
-    NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+    NewSI = Compressed
+                ? Builder.CreateMaskedCompressStore(StoredVal, Addr, Alignment,
+                                                    Mask)
+                : Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
   else
     NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
   applyMetadata(*NewSI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 48bd697397f41..cdfbc531ebfa6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -91,13 +91,14 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
     if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
       NewRecipe = new VPWidenLoadRecipe(
           *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
-          false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
-          Ingredient.getDebugLoc());
+          false /*Consecutive*/, false /*Reverse*/, false /*Compressed*/,
+          VPIRMetadata(*Load), Ingredient.getDebugLoc());
     } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
       NewRecipe = new VPWidenStoreRecipe(
           *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
           nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
-          VPIRMetadata(*Store), Ingredient.getDebugLoc());
+          false /*Compressed*/, VPIRMetadata(*Store),
+          Ingredient.getDebugLoc());
     } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
     } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -4207,7 +4208,7 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
   auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
   auto *L = new VPWidenLoadRecipe(
       *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
-      /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
+      /*Reverse=*/false, /*Compressed=*/false, {}, LoadGroup->getDebugLoc());
   L->insertBefore(LoadGroup);
   NarrowedOps.insert(L);
   return L;
@@ -4344,7 +4345,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
         cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
     auto *S = new VPWidenStoreRecipe(
         *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
-        /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
+        /*Reverse=*/false, /*Compressed=*/false, {}, StoreGroup->getDebugLoc());
     S->insertBefore(StoreGroup);
     StoreGroup->eraseFromParent();
   }
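---

Note for context (not part of the patch): this change only plumbs the new CM_Compressed widening decision and CastContextHint::Compressed through the cost model and the VPlan memory recipes, lowering them to the existing llvm.masked.expandload / llvm.masked.compressstore intrinsics; the analysis that actually assigns CM_Compressed to an access is not in this diff. The kind of source loop such a lowering serves is the compress/expand idiom, where a consecutive access advances its index only on mask-active lanes. A minimal sketch, with illustrative function names that do not come from the patch:

#include <cstddef>

// Compress-store idiom: the store to Dst is consecutive, but its index J
// advances only when the predicate holds, so the active lanes of a vector
// iteration must be packed ("compressed") into one contiguous store.
size_t compressPositive(int *Dst, const int *Src, size_t N) {
  size_t J = 0;
  for (size_t I = 0; I < N; ++I)
    if (Src[I] > 0)
      Dst[J++] = Src[I]; // candidate for llvm.masked.compressstore
  return J;
}

// Expand-load idiom: the dual pattern, where a consecutive load index
// advances only on active lanes (candidate for llvm.masked.expandload).
size_t expandNonZero(int *Dst, const int *Src, size_t N) {
  size_t J = 0;
  for (size_t I = 0; I < N; ++I)
    if (Dst[I] != 0)
      Dst[I] = Src[J++];
  return J;
}

On targets with native support for these intrinsics (e.g. AVX-512's vcompressps/vexpandps), the guarded access can be widened into a single masked instruction instead of a gather/scatter or a scalarized loop, which is what the new getExpandCompressMemoryOpCost hook lets the cost model reflect.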