diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 3497ff7856eed..28648315df201 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1470,6 +1470,8 @@ QualType Sema::CheckNonTypeTemplateParameterType(QualType T,
       T->isLValueReferenceType() ||
       //   -- pointer to member,
       T->isMemberPointerType() ||
+      //   -- block pointer,
+      T->isBlockPointerType() ||
       //   -- std::nullptr_t, or
       T->isNullPtrType() ||
       // -- a type that contains a placeholder type.
@@ -7342,7 +7344,7 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType,
     //   For a non-type template-parameter of pointer or reference type,
     //   the value of the constant expression shall not refer to
     assert(ParamType->isPointerOrReferenceType() ||
-           ParamType->isNullPtrType());
+           ParamType->isNullPtrType() || ParamType->isBlockPointerType());
     // -- a temporary object
     // -- a string literal
     // -- the result of a typeid expression, or
diff --git a/clang/test/SemaCXX/gh189247.cpp b/clang/test/SemaCXX/gh189247.cpp
new file mode 100644
index 0000000000000..79f7b312aee83
--- /dev/null
+++ b/clang/test/SemaCXX/gh189247.cpp
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -fblocks -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+template <void (^)()> struct T;
+T<nullptr> *t;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
deleted file mode 100644
index 6c7bedaf2c933..0000000000000
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ /dev/null
@@ -1,4435 +0,0 @@
-//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains implementations for different VPlan recipes.
-///
-//===----------------------------------------------------------------------===//
-
-#include "LoopVectorizationPlanner.h"
-#include "VPlan.h"
-#include "VPlanAnalysis.h"
-#include "VPlanHelpers.h"
-#include "VPlanPatternMatch.h"
-#include "VPlanUtils.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/IVDescriptors.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include <cassert>
-
-using namespace llvm;
-using namespace llvm::VPlanPatternMatch;
-
-using VectorParts = SmallVector<Value *, 2>;
-
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-
-bool VPRecipeBase::mayWriteToMemory() const {
-  switch (getVPDefID()) {
-  case VPExpressionSC:
-    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
-  case VPInstructionSC: {
-    auto *VPI = cast<VPInstruction>(this);
-    // Loads read from memory but don't write to memory.
-    if (VPI->getOpcode() == Instruction::Load)
-      return false;
-    return VPI->opcodeMayReadOrWriteFromMemory();
-  }
-  case VPInterleaveEVLSC:
-  case VPInterleaveSC:
-    return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
-  case VPWidenStoreEVLSC:
-  case VPWidenStoreSC:
-    return true;
-  case VPReplicateSC:
-    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
-        ->mayWriteToMemory();
-  case VPWidenCallSC:
-    return !cast<VPWidenCallRecipe>(this)
-                ->getCalledScalarFunction()
-                ->onlyReadsMemory();
-  case VPWidenIntrinsicSC:
-    return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
-  case VPCanonicalIVPHISC:
-  case VPBranchOnMaskSC:
-  case VPDerivedIVSC:
-  case VPFirstOrderRecurrencePHISC:
-  case VPReductionPHISC:
-  case VPScalarIVStepsSC:
-  case VPPredInstPHISC:
-    return false;
-  case VPBlendSC:
-  case VPReductionEVLSC:
-  case VPReductionSC:
-  case VPVectorPointerSC:
-  case VPWidenCanonicalIVSC:
-  case VPWidenCastSC:
-  case VPWidenGEPSC:
-  case VPWidenIntOrFpInductionSC:
-  case VPWidenLoadEVLSC:
-  case VPWidenLoadSC:
-  case VPWidenPHISC:
-  case VPWidenPointerInductionSC:
-  case VPWidenSC:
-  case VPWidenSelectSC: {
-    const Instruction *I =
-        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
-    (void)I;
-    assert((!I || !I->mayWriteToMemory()) &&
-           "underlying instruction may write to memory");
-    return false;
-  }
-  default:
-    return true;
-  }
-}
-
-bool VPRecipeBase::mayReadFromMemory() const {
-  switch (getVPDefID()) {
-  case VPExpressionSC:
-    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
-  case VPInstructionSC:
-    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
-  case VPWidenLoadEVLSC:
-  case VPWidenLoadSC:
-    return true;
-  case VPReplicateSC:
-    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
-        ->mayReadFromMemory();
-  case VPWidenCallSC:
-    return !cast<VPWidenCallRecipe>(this)
-                ->getCalledScalarFunction()
-                ->onlyWritesMemory();
-  case VPWidenIntrinsicSC:
-    return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
-  case VPBranchOnMaskSC:
-  case VPDerivedIVSC:
-  case VPFirstOrderRecurrencePHISC:
-  case VPPredInstPHISC:
-  case VPScalarIVStepsSC:
-  case VPWidenStoreEVLSC:
-  case VPWidenStoreSC:
-    return false;
-  case VPBlendSC:
-  case VPReductionEVLSC:
-  case VPReductionSC:
-  case VPVectorPointerSC:
-  case VPWidenCanonicalIVSC:
-  case VPWidenCastSC:
-  case VPWidenGEPSC:
-  case VPWidenIntOrFpInductionSC:
-  case VPWidenPHISC:
-  case VPWidenPointerInductionSC:
-  case VPWidenSC:
-  case VPWidenSelectSC: {
-    const Instruction *I =
-        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
-    (void)I;
-    assert((!I || !I->mayReadFromMemory()) &&
-           "underlying instruction may read from memory");
-    return false;
-  }
-  default:
-    // FIXME: Return false if the recipe represents an interleaved store.
-    return true;
-  }
-}
-
-bool VPRecipeBase::mayHaveSideEffects() const {
-  switch (getVPDefID()) {
-  case VPExpressionSC:
-    return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
-  case VPDerivedIVSC:
-  case VPFirstOrderRecurrencePHISC:
-  case VPPredInstPHISC:
-  case VPVectorEndPointerSC:
-    return false;
-  case VPInstructionSC: {
-    auto *VPI = cast<VPInstruction>(this);
-    return mayWriteToMemory() ||
-           VPI->getOpcode() == VPInstruction::BranchOnCount ||
-           VPI->getOpcode() == VPInstruction::BranchOnCond;
-  }
-  case VPWidenCallSC: {
-    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
-    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
-  }
-  case VPWidenIntrinsicSC:
-    return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
-  case VPBlendSC:
-  case VPReductionEVLSC:
-  case VPReductionSC:
-  case VPScalarIVStepsSC:
-  case VPVectorPointerSC:
-  case VPWidenCanonicalIVSC:
-  case VPWidenCastSC:
-  case VPWidenGEPSC:
-  case VPWidenIntOrFpInductionSC:
-  case VPWidenPHISC:
-  case VPWidenPointerInductionSC:
-  case VPWidenSC:
-  case VPWidenSelectSC: {
-    const Instruction *I =
-        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
-    (void)I;
-    assert((!I || !I->mayHaveSideEffects()) &&
-           "underlying instruction has side-effects");
-    return false;
-  }
-  case VPInterleaveEVLSC:
-  case VPInterleaveSC:
-    return mayWriteToMemory();
-  case VPWidenLoadEVLSC:
-  case VPWidenLoadSC:
-  case VPWidenStoreEVLSC:
-  case VPWidenStoreSC:
-    assert(
-        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
-            mayWriteToMemory() &&
-        "mayHaveSideEffects result for ingredient differs from this "
-        "implementation");
-    return mayWriteToMemory();
-  case VPReplicateSC: {
-    auto *R = cast<VPReplicateRecipe>(this);
-    return R->getUnderlyingInstr()->mayHaveSideEffects();
-  }
-  default:
-    return true;
-  }
-}
-
-void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
-  assert(!Parent && "Recipe already in some VPBasicBlock");
-  assert(InsertPos->getParent() &&
-         "Insertion position not in any VPBasicBlock");
-  InsertPos->getParent()->insert(this, InsertPos->getIterator());
-}
-
-void VPRecipeBase::insertBefore(VPBasicBlock &BB,
-                                iplist<VPRecipeBase>::iterator I) {
-  assert(!Parent && "Recipe already in some VPBasicBlock");
-  assert(I == BB.end() || I->getParent() == &BB);
-  BB.insert(this, I);
-}
-
-void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
-  assert(!Parent && "Recipe already in some VPBasicBlock");
-  assert(InsertPos->getParent() &&
-         "Insertion position not in any VPBasicBlock");
-  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
-}
-
-void VPRecipeBase::removeFromParent() {
-  assert(getParent() && "Recipe not in any VPBasicBlock");
-  getParent()->getRecipeList().remove(getIterator());
-  Parent = nullptr;
-}
-
-iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
-  assert(getParent() && "Recipe not in any VPBasicBlock");
-  return getParent()->getRecipeList().erase(getIterator());
-}
-
-void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
-  removeFromParent();
-  insertAfter(InsertPos);
-}
-
-void VPRecipeBase::moveBefore(VPBasicBlock &BB,
-                              iplist<VPRecipeBase>::iterator I) {
-  removeFromParent();
-  insertBefore(BB, I);
-}
-
-InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
-  // Get the underlying instruction for the recipe, if there is one. It is used
-  // to
-  //   * decide if cost computation should be skipped for this recipe,
-  //   * apply forced target instruction cost.
- Instruction *UI = nullptr; - if (auto *S = dyn_cast(this)) - UI = dyn_cast_or_null(S->getUnderlyingValue()); - else if (auto *IG = dyn_cast(this)) - UI = IG->getInsertPos(); - else if (auto *WidenMem = dyn_cast(this)) - UI = &WidenMem->getIngredient(); - - InstructionCost RecipeCost; - if (UI && Ctx.skipCostComputation(UI, VF.isVector())) { - RecipeCost = 0; - } else { - RecipeCost = computeCost(VF, Ctx); - if (ForceTargetInstructionCost.getNumOccurrences() > 0 && - RecipeCost.isValid()) { - if (UI) - RecipeCost = InstructionCost(ForceTargetInstructionCost); - else - RecipeCost = InstructionCost(0); - } - } - - LLVM_DEBUG({ - dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; - dump(); - }); - return RecipeCost; -} - -InstructionCost VPRecipeBase::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - llvm_unreachable("subclasses should implement computeCost"); -} - -bool VPRecipeBase::isPhi() const { - return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) || - isa(this); -} - -bool VPRecipeBase::isScalarCast() const { - auto *VPI = dyn_cast(this); - return VPI && Instruction::isCast(VPI->getOpcode()); -} - -void VPIRFlags::intersectFlags(const VPIRFlags &Other) { - assert(OpType == Other.OpType && "OpType must match"); - switch (OpType) { - case OperationType::OverflowingBinOp: - WrapFlags.HasNUW &= Other.WrapFlags.HasNUW; - WrapFlags.HasNSW &= Other.WrapFlags.HasNSW; - break; - case OperationType::Trunc: - TruncFlags.HasNUW &= Other.TruncFlags.HasNUW; - TruncFlags.HasNSW &= Other.TruncFlags.HasNSW; - break; - case OperationType::DisjointOp: - DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint; - break; - case OperationType::PossiblyExactOp: - ExactFlags.IsExact &= Other.ExactFlags.IsExact; - break; - case OperationType::GEPOp: - GEPFlags &= Other.GEPFlags; - break; - case OperationType::FPMathOp: - case OperationType::FCmp: - assert((OpType != OperationType::FCmp || - FCmpFlags.Pred == Other.FCmpFlags.Pred) && - "Cannot drop CmpPredicate"); - getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs; - getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs; - break; - case OperationType::NonNegOp: - NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg; - break; - case OperationType::Cmp: - assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate"); - break; - case OperationType::Other: - assert(AllFlags == Other.AllFlags && "Cannot drop other flags"); - break; - } -} - -FastMathFlags VPIRFlags::getFastMathFlags() const { - assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp) && - "recipe doesn't have fast math flags"); - const FastMathFlagsTy &F = getFMFsRef(); - FastMathFlags Res; - Res.setAllowReassoc(F.AllowReassoc); - Res.setNoNaNs(F.NoNaNs); - Res.setNoInfs(F.NoInfs); - Res.setNoSignedZeros(F.NoSignedZeros); - Res.setAllowReciprocal(F.AllowReciprocal); - Res.setAllowContract(F.AllowContract); - Res.setApproxFunc(F.ApproxFunc); - return Res; -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPSingleDefRecipe::dump() const { VPDef::dump(); } - -void VPRecipeBase::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - printRecipe(O, Indent, SlotTracker); - if (auto DL = getDebugLoc()) { - O << ", !dbg "; - DL.print(O); - } - - if (auto *Metadata = dyn_cast(this)) - Metadata->print(O, SlotTracker); -} -#endif - -template -VPValue * -VPUnrollPartAccessor::getUnrollPartOperand(const VPUser &U) const { - if (U.getNumOperands() == PartOpIdx + 1) - return U.getOperand(PartOpIdx); - return 
nullptr; -} - -template -unsigned VPUnrollPartAccessor::getUnrollPart(const VPUser &U) const { - if (auto *UnrollPartOp = getUnrollPartOperand(U)) - return cast(UnrollPartOp->getLiveInIRValue())->getZExtValue(); - return 0; -} - -namespace llvm { -template class VPUnrollPartAccessor<1>; -template class VPUnrollPartAccessor<2>; -template class VPUnrollPartAccessor<3>; -} - -VPInstruction::VPInstruction(unsigned Opcode, ArrayRef Operands, - const VPIRFlags &Flags, const VPIRMetadata &MD, - DebugLoc DL, const Twine &Name) - : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), - VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) { - assert(flagsValidForOpcode(getOpcode()) && - "Set flags not supported for the provided opcode"); - assert((getNumOperandsForOpcode(Opcode) == -1u || - getNumOperandsForOpcode(Opcode) == getNumOperands()) && - "number of operands does not match opcode"); -} - -#ifndef NDEBUG -unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { - if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode)) - return 1; - - if (Instruction::isBinaryOp(Opcode)) - return 2; - - switch (Opcode) { - case VPInstruction::StepVector: - case VPInstruction::VScale: - return 0; - case Instruction::Alloca: - case Instruction::ExtractValue: - case Instruction::Freeze: - case Instruction::Load: - case VPInstruction::BranchOnCond: - case VPInstruction::Broadcast: - case VPInstruction::BuildStructVector: - case VPInstruction::BuildVector: - case VPInstruction::CalculateTripCountMinusVF: - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::ExplicitVectorLength: - case VPInstruction::ExtractLastLane: - case VPInstruction::ExtractLastPart: - case VPInstruction::ExtractPenultimateElement: - case VPInstruction::Not: - case VPInstruction::ResumeForEpilogue: - case VPInstruction::Unpack: - return 1; - case Instruction::ICmp: - case Instruction::FCmp: - case Instruction::ExtractElement: - case Instruction::Store: - case VPInstruction::BranchOnCount: - case VPInstruction::ComputeReductionResult: - case VPInstruction::ExtractLane: - case VPInstruction::FirstOrderRecurrenceSplice: - case VPInstruction::LogicalAnd: - case VPInstruction::PtrAdd: - case VPInstruction::WidePtrAdd: - case VPInstruction::WideIVStep: - return 2; - case Instruction::Select: - case VPInstruction::ActiveLaneMask: - case VPInstruction::ComputeAnyOfResult: - case VPInstruction::ReductionStartVector: - return 3; - case VPInstruction::ComputeFindIVResult: - return 4; - case Instruction::Call: - case Instruction::GetElementPtr: - case Instruction::PHI: - case Instruction::Switch: - case VPInstruction::AnyOf: - case VPInstruction::FirstActiveLane: - case VPInstruction::LastActiveLane: - case VPInstruction::SLPLoad: - case VPInstruction::SLPStore: - // Cannot determine the number of operands from the opcode. 
- return -1u; - } - llvm_unreachable("all cases should be handled above"); -} -#endif - -bool VPInstruction::doesGeneratePerAllLanes() const { - return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); -} - -bool VPInstruction::canGenerateScalarForFirstLane() const { - if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) - return true; - if (isSingleScalar() || isVectorToScalar()) - return true; - switch (Opcode) { - case Instruction::Freeze: - case Instruction::ICmp: - case Instruction::PHI: - case Instruction::Select: - case VPInstruction::BranchOnCond: - case VPInstruction::BranchOnCount: - case VPInstruction::CalculateTripCountMinusVF: - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::PtrAdd: - case VPInstruction::ExplicitVectorLength: - case VPInstruction::AnyOf: - case VPInstruction::Not: - return true; - default: - return false; - } -} - -/// Create a conditional branch using \p Cond branching to the successors of \p -/// VPBB. Note that the first successor is always forward (i.e. not created yet) -/// while the second successor may already have been created (if it is a header -/// block and VPBB is a latch). -static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB, - VPTransformState &State) { - // Replace the temporary unreachable terminator with a new conditional - // branch, hooking it up to backward destination (header) for latch blocks - // now, and to forward destination(s) later when they are created. - // Second successor may be backwards - iff it is already in VPBB2IRBB. - VPBasicBlock *SecondVPSucc = cast(VPBB->getSuccessors()[1]); - BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc); - BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB]; - BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc); - // First successor is always forward, reset it to nullptr - CondBr->setSuccessor(0, nullptr); - IRBB->getTerminator()->eraseFromParent(); - return CondBr; -} - -Value *VPInstruction::generate(VPTransformState &State) { - IRBuilderBase &Builder = State.Builder; - - if (Instruction::isBinaryOp(getOpcode())) { - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); - Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); - auto *Res = - Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); - if (auto *I = dyn_cast(Res)) - applyFlags(*I); - return Res; - } - - switch (getOpcode()) { - case VPInstruction::Not: { - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); - return Builder.CreateNot(A, Name); - } - case Instruction::ExtractElement: { - assert(State.VF.isVector() && "Only extract elements from vectors"); - if (getOperand(1)->isLiveIn()) { - unsigned IdxToExtract = - cast(getOperand(1)->getLiveInIRValue())->getZExtValue(); - return State.get(getOperand(0), VPLane(IdxToExtract)); - } - Value *Vec = State.get(getOperand(0)); - Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); - return Builder.CreateExtractElement(Vec, Idx, Name); - } - case Instruction::Freeze: { - Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); - return Builder.CreateFreeze(Op, Name); - } - case Instruction::FCmp: - case Instruction::ICmp: { - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); - Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); - return 
Builder.CreateCmp(getPredicate(), A, B, Name); - } - case Instruction::PHI: { - llvm_unreachable("should be handled by VPPhi::execute"); - } - case Instruction::Select: { - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *Cond = - State.get(getOperand(0), - OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0))); - Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); - Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); - return Builder.CreateSelect(Cond, Op1, Op2, Name); - } - case VPInstruction::ActiveLaneMask: { - // Get first lane of vector induction variable. - Value *VIVElem0 = State.get(getOperand(0), VPLane(0)); - // Get the original loop tripcount. - Value *ScalarTC = State.get(getOperand(1), VPLane(0)); - - // If this part of the active lane mask is scalar, generate the CMP directly - // to avoid unnecessary extracts. - if (State.VF.isScalar()) - return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, - Name); - - ElementCount EC = State.VF.multiplyCoefficientBy( - cast(getOperand(2)->getLiveInIRValue())->getZExtValue()); - auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC); - return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, - {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, Name); - } - case VPInstruction::FirstOrderRecurrenceSplice: { - // Generate code to combine the previous and current values in vector v3. - // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) - - auto *V1 = State.get(getOperand(0)); - if (!V1->getType()->isVectorTy()) - return V1; - Value *V2 = State.get(getOperand(1)); - return Builder.CreateVectorSplice(V1, V2, -1, Name); - } - case VPInstruction::CalculateTripCountMinusVF: { - unsigned UF = getParent()->getPlan()->getUF(); - Value *ScalarTC = State.get(getOperand(0), VPLane(0)); - Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF); - Value *Sub = Builder.CreateSub(ScalarTC, Step); - Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); - Value *Zero = ConstantInt::getNullValue(ScalarTC->getType()); - return Builder.CreateSelect(Cmp, Sub, Zero); - } - case VPInstruction::ExplicitVectorLength: { - // TODO: Restructure this code with an explicit remainder loop, vsetvli can - // be outside of the main loop. - Value *AVL = State.get(getOperand(0), /*IsScalar*/ true); - // Compute EVL - assert(AVL->getType()->isIntegerTy() && - "Requested vector length should be an integer."); - - assert(State.VF.isScalable() && "Expected scalable vector factor."); - Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue()); - - Value *EVL = Builder.CreateIntrinsic( - Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, - {AVL, VFArg, Builder.getTrue()}); - return EVL; - } - case VPInstruction::CanonicalIVIncrementForPart: { - unsigned Part = getUnrollPart(*this); - auto *IV = State.get(getOperand(0), VPLane(0)); - assert(Part != 0 && "Must have a positive part"); - // The canonical IV is incremented by the vectorization factor (num of - // SIMD elements) times the unroll part. 
- Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); - return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), - hasNoSignedWrap()); - } - case VPInstruction::BranchOnCond: { - Value *Cond = State.get(getOperand(0), VPLane(0)); - auto *Br = createCondBranch(Cond, getParent(), State); - applyMetadata(*Br); - return Br; - } - case VPInstruction::BranchOnCount: { - // First create the compare. - Value *IV = State.get(getOperand(0), /*IsScalar*/ true); - Value *TC = State.get(getOperand(1), /*IsScalar*/ true); - Value *Cond = Builder.CreateICmpEQ(IV, TC); - return createCondBranch(Cond, getParent(), State); - } - case VPInstruction::Broadcast: { - return Builder.CreateVectorSplat( - State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast"); - } - case VPInstruction::BuildStructVector: { - // For struct types, we need to build a new 'wide' struct type, where each - // element is widened, i.e., we create a struct of vectors. - auto *StructTy = - cast(State.TypeAnalysis.inferScalarType(getOperand(0))); - Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF)); - for (const auto &[LaneIndex, Op] : enumerate(operands())) { - for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); - FieldIndex++) { - Value *ScalarValue = - Builder.CreateExtractValue(State.get(Op, true), FieldIndex); - Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex); - VectorValue = - Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex); - Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex); - } - } - return Res; - } - case VPInstruction::BuildVector: { - auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0)); - auto NumOfElements = ElementCount::getFixed(getNumOperands()); - Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements)); - for (const auto &[Idx, Op] : enumerate(operands())) - Res = Builder.CreateInsertElement(Res, State.get(Op, true), - Builder.getInt32(Idx)); - return Res; - } - case VPInstruction::ReductionStartVector: { - if (State.VF.isScalar()) - return State.get(getOperand(0), true); - IRBuilderBase::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(getFastMathFlags()); - // If this start vector is scaled then it should produce a vector with fewer - // elements than the VF. - ElementCount VF = State.VF.divideCoefficientBy( - cast(getOperand(2)->getLiveInIRValue())->getZExtValue()); - auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true)); - return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true), - Builder.getInt32(0)); - } - case VPInstruction::ComputeAnyOfResult: { - // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary - // and will be removed by breaking up the recipe further. - auto *PhiR = cast(getOperand(0)); - auto *OrigPhi = cast(PhiR->getUnderlyingValue()); - Value *ReducedPartRdx = State.get(getOperand(2)); - for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx) - ReducedPartRdx = - Builder.CreateBinOp(Instruction::Or, State.get(getOperand(Idx)), - ReducedPartRdx, "bin.rdx"); - return createAnyOfReduction(Builder, ReducedPartRdx, - State.get(getOperand(1), VPLane(0)), OrigPhi); - } - case VPInstruction::ComputeFindIVResult: { - // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary - // and will be removed by breaking up the recipe further. - auto *PhiR = cast(getOperand(0)); - // Get its reduction variable descriptor. 
- RecurKind RK = PhiR->getRecurrenceKind(); - assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && - "Unexpected reduction kind"); - assert(!PhiR->isInLoop() && - "In-loop FindLastIV reduction is not supported yet"); - - // The recipe's operands are the reduction phi, the start value, the - // sentinel value, followed by one operand for each part of the reduction. - unsigned UF = getNumOperands() - 3; - Value *ReducedPartRdx = State.get(getOperand(3)); - RecurKind MinMaxKind; - bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK); - if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) - MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax; - else - MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin; - for (unsigned Part = 1; Part < UF; ++Part) - ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, - State.get(getOperand(3 + Part))); - - Value *Start = State.get(getOperand(1), true); - Value *Sentinel = getOperand(2)->getLiveInIRValue(); - return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start, - Sentinel); - } - case VPInstruction::ComputeReductionResult: { - // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary - // and will be removed by breaking up the recipe further. - auto *PhiR = cast(getOperand(0)); - // Get its reduction variable descriptor. - - RecurKind RK = PhiR->getRecurrenceKind(); - assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && - "should be handled by ComputeFindIVResult"); - - // The recipe's operands are the reduction phi, followed by one operand for - // each part of the reduction. - unsigned UF = getNumOperands() - 1; - VectorParts RdxParts(UF); - for (unsigned Part = 0; Part < UF; ++Part) - RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop()); - - IRBuilderBase::FastMathFlagGuard FMFG(Builder); - if (hasFastMathFlags()) - Builder.setFastMathFlags(getFastMathFlags()); - - // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = RdxParts[0]; - if (PhiR->isOrdered()) { - ReducedPartRdx = RdxParts[UF - 1]; - } else { - // Floating-point operations should have some FMF to enable the reduction. - for (unsigned Part = 1; Part < UF; ++Part) { - Value *RdxPart = RdxParts[Part]; - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) - ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); - else { - // For sub-recurrences, each UF's reduction variable is already - // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1) - Instruction::BinaryOps Opcode = - RK == RecurKind::Sub - ? Instruction::Add - : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK); - ReducedPartRdx = - Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx"); - } - } - } - - // Create the reduction after the loop. Note that inloop reductions create - // the target reduction in the loop using a Reduction recipe. - if (State.VF.isVector() && !PhiR->isInLoop()) { - // TODO: Support in-order reductions based on the recurrence descriptor. - // All ops in the reduction inherit fast-math-flags from the recurrence - // descriptor. - ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK); - } - - return ReducedPartRdx; - } - case VPInstruction::ExtractLastLane: - case VPInstruction::ExtractPenultimateElement: { - unsigned Offset = - getOpcode() == VPInstruction::ExtractPenultimateElement ? 
2 : 1; - Value *Res; - if (State.VF.isVector()) { - assert(Offset <= State.VF.getKnownMinValue() && - "invalid offset to extract from"); - // Extract lane VF - Offset from the operand. - Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset)); - } else { - // TODO: Remove ExtractLastLane for scalar VFs. - assert(Offset <= 1 && "invalid offset to extract from"); - Res = State.get(getOperand(0)); - } - if (isa(Res)) - Res->setName(Name); - return Res; - } - case VPInstruction::LogicalAnd: { - Value *A = State.get(getOperand(0)); - Value *B = State.get(getOperand(1)); - return Builder.CreateLogicalAnd(A, B, Name); - } - case VPInstruction::PtrAdd: { - assert(vputils::onlyFirstLaneUsed(this) && - "can only generate first lane for PtrAdd"); - Value *Ptr = State.get(getOperand(0), VPLane(0)); - Value *Addend = State.get(getOperand(1), VPLane(0)); - return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); - } - case VPInstruction::WidePtrAdd: { - Value *Ptr = - State.get(getOperand(0), vputils::isSingleScalar(getOperand(0))); - Value *Addend = State.get(getOperand(1)); - return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); - } - case VPInstruction::AnyOf: { - Value *Res = Builder.CreateFreeze(State.get(getOperand(0))); - for (VPValue *Op : drop_begin(operands())) - Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op))); - return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); - } - case VPInstruction::ExtractLane: { - Value *LaneToExtract = State.get(getOperand(0), true); - Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0)); - Value *Res = nullptr; - Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); - - for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) { - Value *VectorStart = - Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1)); - Value *VectorIdx = Idx == 1 - ? LaneToExtract - : Builder.CreateSub(LaneToExtract, VectorStart); - Value *Ext = State.VF.isScalar() - ? State.get(getOperand(Idx)) - : Builder.CreateExtractElement( - State.get(getOperand(Idx)), VectorIdx); - if (Res) { - Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart); - Res = Builder.CreateSelect(Cmp, Ext, Res); - } else { - Res = Ext; - } - } - return Res; - } - case VPInstruction::FirstActiveLane: { - if (getNumOperands() == 1) { - Value *Mask = State.get(getOperand(0)); - return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask, - /*ZeroIsPoison=*/false, Name); - } - // If there are multiple operands, create a chain of selects to pick the - // first operand with an active lane and add the number of lanes of the - // preceding operands. - Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt64Ty(), State.VF); - unsigned LastOpIdx = getNumOperands() - 1; - Value *Res = nullptr; - for (int Idx = LastOpIdx; Idx >= 0; --Idx) { - Value *TrailingZeros = - State.VF.isScalar() - ? 
Builder.CreateZExt( - Builder.CreateICmpEQ(State.get(getOperand(Idx)), - Builder.getFalse()), - Builder.getInt64Ty()) - : Builder.CreateCountTrailingZeroElems( - Builder.getInt64Ty(), State.get(getOperand(Idx)), - /*ZeroIsPoison=*/false, Name); - Value *Current = Builder.CreateAdd( - Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); - if (Res) { - Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF); - Res = Builder.CreateSelect(Cmp, Current, Res); - } else { - Res = Current; - } - } - - return Res; - } - case VPInstruction::ResumeForEpilogue: - return State.get(getOperand(0), true); - default: - llvm_unreachable("Unsupported opcode for instruction"); - } -} - -InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( - unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const { - Type *ScalarTy = Ctx.Types.inferScalarType(this); - Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy; - switch (Opcode) { - case Instruction::FNeg: - return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - TargetTransformInfo::OperandValueInfo RHSInfo = { - TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}; - - if (VF.isVector()) { - // Certain instructions can be cheaper to vectorize if they have a - // constant second vector operand. One example of this are shifts on x86. - VPValue *RHS = getOperand(1); - RHSInfo = Ctx.getOperandInfo(RHS); - - if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && - getOperand(1)->isDefinedOutsideLoopRegions()) - RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; - } - - Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); - SmallVector Operands; - if (CtxI) - Operands.append(CtxI->value_op_begin(), CtxI->value_op_end()); - return Ctx.TTI.getArithmeticInstrCost( - Opcode, ResultTy, Ctx.CostKind, - {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, - RHSInfo, Operands, CtxI, &Ctx.TLI); - } - case Instruction::Freeze: - // This opcode is unknown. Assume that it is the same as 'mul'. - return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy, - Ctx.CostKind); - case Instruction::ExtractValue: - return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue, - Ctx.CostKind); - case Instruction::ICmp: - case Instruction::FCmp: { - Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0)); - Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy; - Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); - return Ctx.TTI.getCmpSelInstrCost( - Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(), - Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_AnyValue, TTI::OP_None}, CtxI); - } - } - llvm_unreachable("called for unsupported opcode"); -} - -InstructionCost VPInstruction::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - if (Instruction::isBinaryOp(getOpcode())) { - if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) { - // TODO: Compute cost for VPInstructions without underlying values once - // the legacy cost model has been retired. 
- return 0; - } - - assert(!doesGeneratePerAllLanes() && - "Should only generate a vector value or single scalar, not scalars " - "for all lanes."); - return getCostForRecipeWithOpcode( - getOpcode(), - vputils::onlyFirstLaneUsed(this) ? ElementCount::getFixed(1) : VF, Ctx); - } - - switch (getOpcode()) { - case Instruction::Select: { - llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; - match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())); - auto *CondTy = Ctx.Types.inferScalarType(getOperand(0)); - auto *VecTy = Ctx.Types.inferScalarType(getOperand(1)); - if (!vputils::onlyFirstLaneUsed(this)) { - CondTy = toVectorTy(CondTy, VF); - VecTy = toVectorTy(VecTy, VF); - } - return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred, - Ctx.CostKind); - } - case Instruction::ExtractElement: - case VPInstruction::ExtractLane: { - if (VF.isScalar()) { - // ExtractLane with VF=1 takes care of handling extracting across multiple - // parts. - return 0; - } - - // Add on the cost of extracting the element. - auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); - return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, - Ctx.CostKind); - } - case VPInstruction::AnyOf: { - auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); - return Ctx.TTI.getArithmeticReductionCost( - Instruction::Or, cast(VecTy), std::nullopt, Ctx.CostKind); - } - case VPInstruction::FirstActiveLane: { - Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); - if (VF.isScalar()) - return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy), - CmpInst::ICMP_EQ, Ctx.CostKind); - // Calculate the cost of determining the lane index. - auto *PredTy = toVectorTy(ScalarTy, VF); - IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, - Type::getInt64Ty(Ctx.LLVMCtx), - {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); - return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); - } - case VPInstruction::LastActiveLane: { - Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); - if (VF.isScalar()) - return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy), - CmpInst::ICMP_EQ, Ctx.CostKind); - // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB. - auto *PredTy = toVectorTy(ScalarTy, VF); - IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, - Type::getInt64Ty(Ctx.LLVMCtx), - {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); - InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); - // Add cost of NOT operation on the predicate. - Cost += Ctx.TTI.getArithmeticInstrCost( - Instruction::Xor, PredTy, Ctx.CostKind, - {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, - {TargetTransformInfo::OK_UniformConstantValue, - TargetTransformInfo::OP_None}); - // Add cost of SUB operation on the index. 
-    Cost += Ctx.TTI.getArithmeticInstrCost(
-        Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind);
-    return Cost;
-  }
-  case VPInstruction::FirstOrderRecurrenceSplice: {
-    assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
-    SmallVector<int> Mask(VF.getKnownMinValue());
-    std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
-    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
-
-    return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
-                                  cast<VectorType>(VectorTy),
-                                  cast<VectorType>(VectorTy), Mask,
-                                  Ctx.CostKind, VF.getKnownMinValue() - 1);
-  }
-  case VPInstruction::ActiveLaneMask: {
-    Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
-    unsigned Multiplier =
-        cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
-    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
-    IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
-                                  {ArgTy, ArgTy});
-    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
-  }
-  case VPInstruction::ExplicitVectorLength: {
-    Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
-    Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
-    Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
-    IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
-                                  I32Ty, {Arg0Ty, I32Ty, I1Ty});
-    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
-  }
-  case VPInstruction::ExtractLastLane: {
-    // Add on the cost of extracting the element.
-    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
-    return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
-                                                    VecTy, Ctx.CostKind, 0);
-  }
-  case VPInstruction::ExtractPenultimateElement:
-    if (VF == ElementCount::getScalable(1))
-      return InstructionCost::getInvalid();
-    [[fallthrough]];
-  default:
-    // TODO: Compute cost for other VPInstructions once the legacy cost model
-    // has been retired.
-    assert(!getUnderlyingValue() &&
-           "unexpected VPInstruction with underlying value");
-    return 0;
-  }
-}
-
-bool VPInstruction::isVectorToScalar() const {
-  return getOpcode() == VPInstruction::ExtractLastLane ||
-         getOpcode() == VPInstruction::ExtractPenultimateElement ||
-         getOpcode() == Instruction::ExtractElement ||
-         getOpcode() == VPInstruction::ExtractLane ||
-         getOpcode() == VPInstruction::FirstActiveLane ||
-         getOpcode() == VPInstruction::LastActiveLane ||
-         getOpcode() == VPInstruction::ComputeAnyOfResult ||
-         getOpcode() == VPInstruction::ComputeFindIVResult ||
-         getOpcode() == VPInstruction::ComputeReductionResult ||
-         getOpcode() == VPInstruction::AnyOf;
-}
-
-bool VPInstruction::isSingleScalar() const {
-  switch (getOpcode()) {
-  case Instruction::PHI:
-  case VPInstruction::ExplicitVectorLength:
-  case VPInstruction::ResumeForEpilogue:
-  case VPInstruction::VScale:
-    return true;
-  default:
-    return isScalarCast();
-  }
-}
-
-void VPInstruction::execute(VPTransformState &State) {
-  assert(!State.Lane && "VPInstruction executing a Lane");
-  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
-  assert(flagsValidForOpcode(getOpcode()) &&
-         "Set flags not supported for the provided opcode");
-  if (hasFastMathFlags())
-    State.Builder.setFastMathFlags(getFastMathFlags());
-  Value *GeneratedValue = generate(State);
-  if (!hasResult())
-    return;
-  assert(GeneratedValue && "generate must produce a value");
-  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
-                                   (vputils::onlyFirstLaneUsed(this) ||
-                                    isVectorToScalar() || isSingleScalar());
-  assert((((GeneratedValue->getType()->isVectorTy() ||
-            GeneratedValue->getType()->isStructTy()) ==
-           !GeneratesPerFirstLaneOnly) ||
-          State.VF.isScalar()) &&
-         "scalar value but not only first lane defined");
-  State.set(this, GeneratedValue,
-            /*IsScalar*/ GeneratesPerFirstLaneOnly);
-}
-
-bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
-  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
-    return false;
-  switch (getOpcode()) {
-  case Instruction::GetElementPtr:
-  case Instruction::ExtractElement:
-  case Instruction::Freeze:
-  case Instruction::FCmp:
-  case Instruction::ICmp:
-  case Instruction::Select:
-  case Instruction::PHI:
-  case VPInstruction::AnyOf:
-  case VPInstruction::BranchOnCond:
-  case VPInstruction::BranchOnCount:
-  case VPInstruction::Broadcast:
-  case VPInstruction::BuildStructVector:
-  case VPInstruction::BuildVector:
-  case VPInstruction::CalculateTripCountMinusVF:
-  case VPInstruction::CanonicalIVIncrementForPart:
-  case VPInstruction::ExtractLane:
-  case VPInstruction::ExtractLastLane:
-  case VPInstruction::ExtractLastPart:
-  case VPInstruction::ExtractPenultimateElement:
-  case VPInstruction::ActiveLaneMask:
-  case VPInstruction::ExplicitVectorLength:
-  case VPInstruction::FirstActiveLane:
-  case VPInstruction::LastActiveLane:
-  case VPInstruction::FirstOrderRecurrenceSplice:
-  case VPInstruction::LogicalAnd:
-  case VPInstruction::Not:
-  case VPInstruction::PtrAdd:
-  case VPInstruction::WideIVStep:
-  case VPInstruction::WidePtrAdd:
-  case VPInstruction::StepVector:
-  case VPInstruction::ReductionStartVector:
-  case VPInstruction::VScale:
-  case VPInstruction::Unpack:
-    return false;
-  default:
-    return true;
-  }
-}
-
-bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
-  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
-  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
-    return
vputils::onlyFirstLaneUsed(this); - - switch (getOpcode()) { - default: - return false; - case Instruction::ExtractElement: - return Op == getOperand(1); - case Instruction::PHI: - return true; - case Instruction::FCmp: - case Instruction::ICmp: - case Instruction::Select: - case Instruction::Or: - case Instruction::Freeze: - case VPInstruction::Not: - // TODO: Cover additional opcodes. - return vputils::onlyFirstLaneUsed(this); - case VPInstruction::ActiveLaneMask: - case VPInstruction::ExplicitVectorLength: - case VPInstruction::CalculateTripCountMinusVF: - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::BranchOnCount: - case VPInstruction::BranchOnCond: - case VPInstruction::Broadcast: - case VPInstruction::ReductionStartVector: - return true; - case VPInstruction::BuildStructVector: - case VPInstruction::BuildVector: - // Before replicating by VF, Build(Struct)Vector uses all lanes of the - // operand, after replicating its operands only the first lane is used. - // Before replicating, it will have only a single operand. - return getNumOperands() > 1; - case VPInstruction::PtrAdd: - return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this); - case VPInstruction::WidePtrAdd: - // WidePtrAdd supports scalar and vector base addresses. - return false; - case VPInstruction::ComputeAnyOfResult: - case VPInstruction::ComputeFindIVResult: - return Op == getOperand(1); - case VPInstruction::ExtractLane: - return Op == getOperand(0); - }; - llvm_unreachable("switch should return"); -} - -bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const { - assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - if (Instruction::isBinaryOp(getOpcode())) - return vputils::onlyFirstPartUsed(this); - - switch (getOpcode()) { - default: - return false; - case Instruction::FCmp: - case Instruction::ICmp: - case Instruction::Select: - return vputils::onlyFirstPartUsed(this); - case VPInstruction::BranchOnCount: - case VPInstruction::BranchOnCond: - case VPInstruction::CanonicalIVIncrementForPart: - return true; - }; - llvm_unreachable("switch should return"); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPInstruction::dump() const { - VPSlotTracker SlotTracker(getParent()->getPlan()); - printRecipe(dbgs(), "", SlotTracker); -} - -void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; - - if (hasResult()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - - switch (getOpcode()) { - case VPInstruction::Not: - O << "not"; - break; - case VPInstruction::SLPLoad: - O << "combined load"; - break; - case VPInstruction::SLPStore: - O << "combined store"; - break; - case VPInstruction::ActiveLaneMask: - O << "active lane mask"; - break; - case VPInstruction::ExplicitVectorLength: - O << "EXPLICIT-VECTOR-LENGTH"; - break; - case VPInstruction::FirstOrderRecurrenceSplice: - O << "first-order splice"; - break; - case VPInstruction::BranchOnCond: - O << "branch-on-cond"; - break; - case VPInstruction::CalculateTripCountMinusVF: - O << "TC > VF ? 
TC - VF : 0"; - break; - case VPInstruction::CanonicalIVIncrementForPart: - O << "VF * Part +"; - break; - case VPInstruction::BranchOnCount: - O << "branch-on-count"; - break; - case VPInstruction::Broadcast: - O << "broadcast"; - break; - case VPInstruction::BuildStructVector: - O << "buildstructvector"; - break; - case VPInstruction::BuildVector: - O << "buildvector"; - break; - case VPInstruction::ExtractLane: - O << "extract-lane"; - break; - case VPInstruction::ExtractLastLane: - O << "extract-last-lane"; - break; - case VPInstruction::ExtractLastPart: - O << "extract-last-part"; - break; - case VPInstruction::ExtractPenultimateElement: - O << "extract-penultimate-element"; - break; - case VPInstruction::ComputeAnyOfResult: - O << "compute-anyof-result"; - break; - case VPInstruction::ComputeFindIVResult: - O << "compute-find-iv-result"; - break; - case VPInstruction::ComputeReductionResult: - O << "compute-reduction-result"; - break; - case VPInstruction::LogicalAnd: - O << "logical-and"; - break; - case VPInstruction::PtrAdd: - O << "ptradd"; - break; - case VPInstruction::WidePtrAdd: - O << "wide-ptradd"; - break; - case VPInstruction::AnyOf: - O << "any-of"; - break; - case VPInstruction::FirstActiveLane: - O << "first-active-lane"; - break; - case VPInstruction::LastActiveLane: - O << "last-active-lane"; - break; - case VPInstruction::ReductionStartVector: - O << "reduction-start-vector"; - break; - case VPInstruction::ResumeForEpilogue: - O << "resume-for-epilogue"; - break; - case VPInstruction::Unpack: - O << "unpack"; - break; - default: - O << Instruction::getOpcodeName(getOpcode()); - } - - printFlags(O); - printOperands(O, SlotTracker); -} -#endif - -void VPInstructionWithType::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); - if (isScalarCast()) { - Value *Op = State.get(getOperand(0), VPLane(0)); - Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()), - Op, ResultTy); - State.set(this, Cast, VPLane(0)); - return; - } - switch (getOpcode()) { - case VPInstruction::StepVector: { - Value *StepVector = - State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF)); - State.set(this, StepVector); - break; - } - case VPInstruction::VScale: { - Value *VScale = State.Builder.CreateVScale(ResultTy); - State.set(this, VScale, true); - break; - } - - default: - llvm_unreachable("opcode not implemented yet"); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT" << (isSingleScalar() ? 
"-SCALAR" : "") << " "; - printAsOperand(O, SlotTracker); - O << " = "; - - switch (getOpcode()) { - case VPInstruction::WideIVStep: - O << "wide-iv-step "; - printOperands(O, SlotTracker); - break; - case VPInstruction::StepVector: - O << "step-vector " << *ResultTy; - break; - case VPInstruction::VScale: - O << "vscale " << *ResultTy; - break; - default: - assert(Instruction::isCast(getOpcode()) && "unhandled opcode"); - O << Instruction::getOpcodeName(getOpcode()) << " "; - printOperands(O, SlotTracker); - O << " to " << *ResultTy; - } -} -#endif - -void VPPhi::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); - PHINode *NewPhi = State.Builder.CreatePHI( - State.TypeAnalysis.inferScalarType(this), 2, getName()); - unsigned NumIncoming = getNumIncoming(); - if (getParent() != getParent()->getPlan()->getScalarPreheader()) { - // TODO: Fixup all incoming values of header phis once recipes defining them - // are introduced. - NumIncoming = 1; - } - for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) { - Value *IncV = State.get(getIncomingValue(Idx), VPLane(0)); - BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx)); - NewPhi->addIncoming(IncV, PredBB); - } - State.set(this, NewPhi, VPLane(0)); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; - printAsOperand(O, SlotTracker); - O << " = phi "; - printPhiOperands(O, SlotTracker); -} -#endif - -VPIRInstruction *VPIRInstruction ::create(Instruction &I) { - if (auto *Phi = dyn_cast(&I)) - return new VPIRPhi(*Phi); - return new VPIRInstruction(I); -} - -void VPIRInstruction::execute(VPTransformState &State) { - assert(!isa(this) && getNumOperands() == 0 && - "PHINodes must be handled by VPIRPhi"); - // Advance the insert point after the wrapped IR instruction. This allows - // interleaving VPIRInstructions and other recipes. - State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator())); -} - -InstructionCost VPIRInstruction::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - // The recipe wraps an existing IR instruction on the border of VPlan's scope, - // hence it does not contribute to the cost-modeling for the VPlan. - return 0; -} - -void VPIRInstruction::extractLastLaneOfLastPartOfFirstOperand( - VPBuilder &Builder) { - assert(isa(getInstruction()) && - "can only update exiting operands to phi nodes"); - assert(getNumOperands() > 0 && "must have at least one operand"); - VPValue *Exiting = getOperand(0); - if (Exiting->isLiveIn()) - return; - - Exiting = Builder.createNaryOp(VPInstruction::ExtractLastPart, Exiting); - Exiting = Builder.createNaryOp(VPInstruction::ExtractLastLane, Exiting); - setOperand(0, Exiting); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "IR " << I; -} -#endif - -void VPIRPhi::execute(VPTransformState &State) { - PHINode *Phi = &getIRPhi(); - for (const auto &[Idx, Op] : enumerate(operands())) { - VPValue *ExitValue = Op; - auto Lane = vputils::isSingleScalar(ExitValue) - ? 
VPLane::getFirstLane() - : VPLane::getLastLaneForVF(State.VF); - VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; - auto *PredVPBB = Pred->getExitingBasicBlock(); - BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; - // Set insertion point in PredBB in case an extract needs to be generated. - // TODO: Model extracts explicitly. - State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); - Value *V = State.get(ExitValue, VPLane(Lane)); - // If there is no existing block for PredBB in the phi, add a new incoming - // value. Otherwise update the existing incoming value for PredBB. - if (Phi->getBasicBlockIndex(PredBB) == -1) - Phi->addIncoming(V, PredBB); - else - Phi->setIncomingValueForBlock(PredBB, V); - } - - // Advance the insert point after the wrapped IR instruction. This allows - // interleaving VPIRInstructions and other recipes. - State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator())); -} - -void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { - VPRecipeBase *R = const_cast(getAsRecipe()); - assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && - "Number of phi operands must match number of predecessors"); - unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock); - R->removeOperand(Position); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPPhiAccessors::printPhiOperands(raw_ostream &O, - VPSlotTracker &SlotTracker) const { - interleaveComma(enumerate(getAsRecipe()->operands()), O, - [this, &O, &SlotTracker](auto Op) { - O << "[ "; - Op.value()->printAsOperand(O, SlotTracker); - O << ", "; - getIncomingBlock(Op.index())->printAsOperand(O); - O << " ]"; - }); -} -#endif - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - VPIRInstruction::printRecipe(O, Indent, SlotTracker); - - if (getNumOperands() != 0) { - O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; - interleaveComma(incoming_values_and_blocks(), O, - [&O, &SlotTracker](auto Op) { - std::get<0>(Op)->printAsOperand(O, SlotTracker); - O << " from "; - std::get<1>(Op)->printAsOperand(O); - }); - O << ")"; - } -} -#endif - -void VPIRMetadata::applyMetadata(Instruction &I) const { - for (const auto &[Kind, Node] : Metadata) - I.setMetadata(Kind, Node); -} - -void VPIRMetadata::intersect(const VPIRMetadata &Other) { - SmallVector> MetadataIntersection; - for (const auto &[KindA, MDA] : Metadata) { - for (const auto &[KindB, MDB] : Other.Metadata) { - if (KindA == KindB && MDA == MDB) { - MetadataIntersection.emplace_back(KindA, MDA); - break; - } - } - } - Metadata = std::move(MetadataIntersection); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { - const Module *M = SlotTracker.getModule(); - if (Metadata.empty() || !M) - return; - - ArrayRef MDNames = SlotTracker.getMDNames(); - O << " ("; - interleaveComma(Metadata, O, [&](const auto &KindNodePair) { - auto [Kind, Node] = KindNodePair; - assert(Kind < MDNames.size() && !MDNames[Kind].empty() && - "Unexpected unnamed metadata kind"); - O << "!" 
<< MDNames[Kind] << " "; - Node->printAsOperand(O, M); - }); - O << ")"; -} -#endif - -void VPWidenCallRecipe::execute(VPTransformState &State) { - assert(State.VF.isVector() && "not widening"); - assert(Variant != nullptr && "Can't create vector function."); - - FunctionType *VFTy = Variant->getFunctionType(); - // Add return type if intrinsic is overloaded on it. - SmallVector Args; - for (const auto &I : enumerate(args())) { - Value *Arg; - // Some vectorized function variants may also take a scalar argument, - // e.g. linear parameters for pointers. This needs to be the scalar value - // from the start of the respective part when interleaving. - if (!VFTy->getParamType(I.index())->isVectorTy()) - Arg = State.get(I.value(), VPLane(0)); - else - Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); - Args.push_back(Arg); - } - - auto *CI = cast_or_null(getUnderlyingValue()); - SmallVector OpBundles; - if (CI) - CI->getOperandBundlesAsDefs(OpBundles); - - CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles); - applyFlags(*V); - applyMetadata(*V); - V->setCallingConv(Variant->getCallingConv()); - - if (!V->getType()->isVoidTy()) - State.set(this, V); -} - -InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(), - Variant->getFunctionType()->params(), - Ctx.CostKind); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-CALL "; - - Function *CalledFn = getCalledScalarFunction(); - if (CalledFn->getReturnType()->isVoidTy()) - O << "void "; - else { - printAsOperand(O, SlotTracker); - O << " = "; - } - - O << "call"; - printFlags(O); - O << " @" << CalledFn->getName() << "("; - interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) { - Op->printAsOperand(O, SlotTracker); - }); - O << ")"; - - O << " (using library function"; - if (Variant->hasName()) - O << ": " << Variant->getName(); - O << ")"; -} -#endif - -void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { - assert(State.VF.isVector() && "not widening"); - - SmallVector TysForDecl; - // Add return type if intrinsic is overloaded on it. - if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI)) - TysForDecl.push_back(VectorType::get(getResultType(), State.VF)); - SmallVector Args; - for (const auto &I : enumerate(operands())) { - // Some intrinsics have a scalar argument - don't replace it with a - // vector. - Value *Arg; - if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(), - State.TTI)) - Arg = State.get(I.value(), VPLane(0)); - else - Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); - if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), - State.TTI)) - TysForDecl.push_back(Arg->getType()); - Args.push_back(Arg); - } - - // Use vector version of the intrinsic. 
- Module *M = State.Builder.GetInsertBlock()->getModule(); - Function *VectorF = - Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); - assert(VectorF && - "Can't retrieve vector intrinsic or vector-predication intrinsics."); - - auto *CI = cast_or_null(getUnderlyingValue()); - SmallVector OpBundles; - if (CI) - CI->getOperandBundlesAsDefs(OpBundles); - - CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); - - applyFlags(*V); - applyMetadata(*V); - - if (!V->getType()->isVoidTy()) - State.set(this, V); -} - -/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R. -static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, - ArrayRef Operands, - const VPRecipeWithIRFlags &R, - ElementCount VF, - VPCostContext &Ctx) { - // Some backends analyze intrinsic arguments to determine cost. Use the - // underlying value for the operand if it has one. Otherwise try to use the - // operand of the underlying call instruction, if there is one. Otherwise - // clear Arguments. - // TODO: Rework TTI interface to be independent of concrete IR values. - SmallVector Arguments; - for (const auto &[Idx, Op] : enumerate(Operands)) { - auto *V = Op->getUnderlyingValue(); - if (!V) { - if (auto *UI = dyn_cast_or_null(R.getUnderlyingValue())) { - Arguments.push_back(UI->getArgOperand(Idx)); - continue; - } - Arguments.clear(); - break; - } - Arguments.push_back(V); - } - - Type *ScalarRetTy = Ctx.Types.inferScalarType(&R); - Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy; - SmallVector ParamTys; - for (const VPValue *Op : Operands) { - ParamTys.push_back(VF.isVector() - ? toVectorTy(Ctx.Types.inferScalarType(Op), VF) - : Ctx.Types.inferScalarType(Op)); - } - - // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. - FastMathFlags FMF = - R.hasFastMathFlags() ? 
R.getFastMathFlags() : FastMathFlags(); - IntrinsicCostAttributes CostAttrs( - ID, RetTy, Arguments, ParamTys, FMF, - dyn_cast_or_null(R.getUnderlyingValue()), - InstructionCost::getInvalid(), &Ctx.TLI); - return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); -} - -InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - SmallVector ArgOps(operands()); - return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx); -} - -StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { - return Intrinsic::getBaseName(VectorIntrinsicID); -} - -bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const { - assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return all_of(enumerate(operands()), [this, &Op](const auto &X) { - auto [Idx, V] = X; - return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(), - Idx, nullptr); - }); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-INTRINSIC "; - if (ResultTy->isVoidTy()) { - O << "void "; - } else { - printAsOperand(O, SlotTracker); - O << " = "; - } - - O << "call"; - printFlags(O); - O << getIntrinsicName() << "("; - - interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { - Op->printAsOperand(O, SlotTracker); - }); - O << ")"; -} -#endif - -void VPHistogramRecipe::execute(VPTransformState &State) { - IRBuilderBase &Builder = State.Builder; - - Value *Address = State.get(getOperand(0)); - Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true); - VectorType *VTy = cast(Address->getType()); - - // The histogram intrinsic requires a mask even if the recipe doesn't; - // if the mask operand was omitted then all lanes should be executed and - // we just need to synthesize an all-true mask. - Value *Mask = nullptr; - if (VPValue *VPMask = getMask()) - Mask = State.get(VPMask); - else - Mask = - Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1)); - - // If this is a subtract, we want to invert the increment amount. We may - // add a separate intrinsic in future, but for now we'll try this. - if (Opcode == Instruction::Sub) - IncAmt = Builder.CreateNeg(IncAmt); - else - assert(Opcode == Instruction::Add && "only add or sub supported for now"); - - State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add, - {VTy, IncAmt->getType()}, - {Address, IncAmt, Mask}); -} - -InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - // FIXME: Take the gather and scatter into account as well. For now we're - // generating the same cost as the fallback path, but we'll likely - // need to create a new TTI method for determining the cost, including - // whether we can use base + vec-of-smaller-indices or just - // vec-of-pointers. - assert(VF.isVector() && "Invalid VF for histogram cost"); - Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0)); - VPValue *IncAmt = getOperand(1); - Type *IncTy = Ctx.Types.inferScalarType(IncAmt); - VectorType *VTy = VectorType::get(IncTy, VF); - - // Assume that a non-constant update value (or a constant != 1) requires - // a multiply, and add that into the cost. 
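- // Editorial illustration (hypothetical IR, VF=4): the operation being
- // costed corresponds to
- //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
- //       <4 x ptr> %buckets, i32 %inc, <4 x i1> %mask)
- // plus the add/sub on the bucket counts; only a non-unit %inc pays the
- // extra multiply modelled below.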
- InstructionCost MulCost = - Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind); - if (IncAmt->isLiveIn()) { - ConstantInt *CI = dyn_cast(IncAmt->getLiveInIRValue()); - - if (CI && CI->getZExtValue() == 1) - MulCost = TTI::TCC_Free; - } - - // Find the cost of the histogram operation itself. - Type *PtrTy = VectorType::get(AddressTy, VF); - Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF); - IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, - Type::getVoidTy(Ctx.LLVMCtx), - {PtrTy, IncTy, MaskTy}); - - // Add the costs together with the add/sub operation. - return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost + - Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-HISTOGRAM buckets: "; - getOperand(0)->printAsOperand(O, SlotTracker); - - if (Opcode == Instruction::Sub) - O << ", dec: "; - else { - assert(Opcode == Instruction::Add); - O << ", inc: "; - } - getOperand(1)->printAsOperand(O, SlotTracker); - - if (VPValue *Mask = getMask()) { - O << ", mask: "; - Mask->printAsOperand(O, SlotTracker); - } -} - -void VPWidenSelectRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-SELECT "; - printAsOperand(O, SlotTracker); - O << " = select"; - printFlags(O); - getOperand(0)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(1)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(2)->printAsOperand(O, SlotTracker); - O << (vputils::isSingleScalar(getCond()) ? " (condition is single-scalar)" - : ""); -} -#endif - -void VPWidenSelectRecipe::execute(VPTransformState &State) { - Value *Cond = State.get(getCond(), vputils::isSingleScalar(getCond())); - - Value *Op0 = State.get(getOperand(1)); - Value *Op1 = State.get(getOperand(2)); - Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); - State.set(this, Sel); - if (auto *I = dyn_cast(Sel)) { - if (isa(I)) - applyFlags(*I); - applyMetadata(*I); - } -} - -InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - SelectInst *SI = cast(getUnderlyingValue()); - bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); - Type *ScalarTy = Ctx.Types.inferScalarType(this); - Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); - - VPValue *Op0, *Op1; - if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && - (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || - match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { - // select x, y, false --> x & y - // select x, true, y --> x | y - const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0); - const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1); - - SmallVector Operands; - if (all_of(operands(), - [](VPValue *Op) { return Op->getUnderlyingValue(); })) - Operands.append(SI->op_begin(), SI->op_end()); - bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))); - return Ctx.TTI.getArithmeticInstrCost( - IsLogicalOr ? 
Instruction::Or : Instruction::And, VectorTy, - Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI); - } - - Type *CondTy = Ctx.Types.inferScalarType(getOperand(0)); - if (!ScalarCond) - CondTy = VectorType::get(CondTy, VF); - - CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; - if (auto *Cmp = dyn_cast(SI->getCondition())) - Pred = Cmp->getPredicate(); - return Ctx.TTI.getCmpSelInstrCost( - Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI); -} - -VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { - AllowReassoc = FMF.allowReassoc(); - NoNaNs = FMF.noNaNs(); - NoInfs = FMF.noInfs(); - NoSignedZeros = FMF.noSignedZeros(); - AllowReciprocal = FMF.allowReciprocal(); - AllowContract = FMF.allowContract(); - ApproxFunc = FMF.approxFunc(); -} - -#if !defined(NDEBUG) -bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { - switch (OpType) { - case OperationType::OverflowingBinOp: - return Opcode == Instruction::Add || Opcode == Instruction::Sub || - Opcode == Instruction::Mul || Opcode == Instruction::Shl || - Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart; - case OperationType::Trunc: - return Opcode == Instruction::Trunc; - case OperationType::DisjointOp: - return Opcode == Instruction::Or; - case OperationType::PossiblyExactOp: - return Opcode == Instruction::AShr || Opcode == Instruction::LShr || - Opcode == Instruction::UDiv || Opcode == Instruction::SDiv; - case OperationType::GEPOp: - return Opcode == Instruction::GetElementPtr || - Opcode == VPInstruction::PtrAdd || - Opcode == VPInstruction::WidePtrAdd; - case OperationType::FPMathOp: - return Opcode == Instruction::Call || Opcode == Instruction::FAdd || - Opcode == Instruction::FMul || Opcode == Instruction::FSub || - Opcode == Instruction::FNeg || Opcode == Instruction::FDiv || - Opcode == Instruction::FRem || Opcode == Instruction::FPExt || - Opcode == Instruction::FPTrunc || Opcode == Instruction::Select || - Opcode == VPInstruction::WideIVStep || - Opcode == VPInstruction::ReductionStartVector || - Opcode == VPInstruction::ComputeReductionResult; - case OperationType::FCmp: - return Opcode == Instruction::FCmp; - case OperationType::NonNegOp: - return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP; - case OperationType::Cmp: - return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp; - case OperationType::Other: - return true; - } - llvm_unreachable("Unknown OperationType enum"); -} -#endif - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPIRFlags::printFlags(raw_ostream &O) const { - switch (OpType) { - case OperationType::Cmp: - O << " " << CmpInst::getPredicateName(getPredicate()); - break; - case OperationType::FCmp: - O << " " << CmpInst::getPredicateName(getPredicate()); - getFastMathFlags().print(O); - break; - case OperationType::DisjointOp: - if (DisjointFlags.IsDisjoint) - O << " disjoint"; - break; - case OperationType::PossiblyExactOp: - if (ExactFlags.IsExact) - O << " exact"; - break; - case OperationType::OverflowingBinOp: - if (WrapFlags.HasNUW) - O << " nuw"; - if (WrapFlags.HasNSW) - O << " nsw"; - break; - case OperationType::Trunc: - if (TruncFlags.HasNUW) - O << " nuw"; - if (TruncFlags.HasNSW) - O << " nsw"; - break; - case OperationType::FPMathOp: - getFastMathFlags().print(O); - break; - case OperationType::GEPOp: - if (GEPFlags.isInBounds()) - O << " inbounds"; - else if (GEPFlags.hasNoUnsignedSignedWrap()) - O << " nusw"; - if 
(GEPFlags.hasNoUnsignedWrap()) - O << " nuw"; - break; - case OperationType::NonNegOp: - if (NonNegFlags.NonNeg) - O << " nneg"; - break; - case OperationType::Other: - break; - } - O << " "; -} -#endif - -void VPWidenRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - switch (Opcode) { - case Instruction::Call: - case Instruction::Br: - case Instruction::PHI: - case Instruction::GetElementPtr: - case Instruction::Select: - llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::FNeg: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen unops and binops. - SmallVector Ops; - for (VPValue *VPOp : operands()) - Ops.push_back(State.get(VPOp)); - - Value *V = Builder.CreateNAryOp(Opcode, Ops); - - if (auto *VecOp = dyn_cast(V)) { - applyFlags(*VecOp); - applyMetadata(*VecOp); - } - - // Use this vector value for all users of the original instruction. - State.set(this, V); - break; - } - case Instruction::ExtractValue: { - assert(getNumOperands() == 2 && "expected single level extractvalue"); - Value *Op = State.get(getOperand(0)); - auto *CI = cast(getOperand(1)->getLiveInIRValue()); - Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue()); - State.set(this, Extract); - break; - } - case Instruction::Freeze: { - Value *Op = State.get(getOperand(0)); - Value *Freeze = Builder.CreateFreeze(Op); - State.set(this, Freeze); - break; - } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = Opcode == Instruction::FCmp; - Value *A = State.get(getOperand(0)); - Value *B = State.get(getOperand(1)); - Value *C = nullptr; - if (FCmp) { - C = Builder.CreateFCmp(getPredicate(), A, B); - } else { - C = Builder.CreateICmp(getPredicate(), A, B); - } - if (auto *I = dyn_cast(C)) { - applyFlags(*I); - applyMetadata(*I); - } - State.set(this, C); - break; - } - default: - // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " - << Instruction::getOpcodeName(Opcode)); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. - -#if !defined(NDEBUG) - // Verify that VPlan type inference results agree with the type of the - // generated values. - assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == - State.get(this)->getType() && - "inferred type and type from generated instructions do not match"); -#endif -} - -InstructionCost VPWidenRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - switch (Opcode) { - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - // If the div/rem operation isn't safe to speculate and requires - // predication, then the only way we can even create a vplan is to insert - // a select on the second input operand to ensure we use the value of 1 - // for the inactive lanes. The select will be costed separately. 
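- // Editorial illustration: a predicated 'udiv %a, %b' only becomes
- // vectorizable once the divisor is guarded, e.g. for VF=4:
- //   %safe.b = select <4 x i1> %mask, <4 x i32> %b,
- //                    <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- //   %div    = udiv <4 x i32> %a, %safe.b
- // so inactive lanes divide by 1; the select's cost is attributed to the
- // select recipe, not here.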
- case Instruction::FNeg: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Freeze: - case Instruction::ExtractValue: - case Instruction::ICmp: - case Instruction::FCmp: - return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx); - default: - llvm_unreachable("Unsupported opcode for instruction"); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN "; - printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(Opcode); - printFlags(O); - printOperands(O, SlotTracker); -} -#endif - -void VPWidenCastRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - /// Vectorize casts. - assert(State.VF.isVector() && "Not vectorizing?"); - Type *DestTy = VectorType::get(getResultType(), State.VF); - VPValue *Op = getOperand(0); - Value *A = State.get(Op); - Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); - State.set(this, Cast); - if (auto *CastOp = dyn_cast(Cast)) { - applyFlags(*CastOp); - applyMetadata(*CastOp); - } -} - -InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - // TODO: In some cases, VPWidenCastRecipes are created but not considered in - // the legacy cost model, including truncates/extends when evaluating a - // reduction in a smaller type. - if (!getUnderlyingValue()) - return 0; - // Computes the CastContextHint from a recipes that may access memory. - auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { - if (VF.isScalar()) - return TTI::CastContextHint::Normal; - if (isa(R)) - return TTI::CastContextHint::Interleave; - if (const auto *ReplicateRecipe = dyn_cast(R)) - return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked - : TTI::CastContextHint::Normal; - const auto *WidenMemoryRecipe = dyn_cast(R); - if (WidenMemoryRecipe == nullptr) - return TTI::CastContextHint::None; - if (!WidenMemoryRecipe->isConsecutive()) - return TTI::CastContextHint::GatherScatter; - if (WidenMemoryRecipe->isReverse()) - return TTI::CastContextHint::Reversed; - if (WidenMemoryRecipe->isMasked()) - return TTI::CastContextHint::Masked; - return TTI::CastContextHint::Normal; - }; - - VPValue *Operand = getOperand(0); - TTI::CastContextHint CCH = TTI::CastContextHint::None; - // For Trunc/FPTrunc, get the context from the only user. - if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) && - !hasMoreThanOneUniqueUser() && getNumUsers() > 0) { - if (auto *StoreRecipe = dyn_cast(*user_begin())) - CCH = ComputeCCH(StoreRecipe); - } - // For Z/Sext, get the context from the operand. - else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || - Opcode == Instruction::FPExt) { - if (Operand->isLiveIn()) - CCH = TTI::CastContextHint::Normal; - else if (Operand->getDefiningRecipe()) - CCH = ComputeCCH(Operand->getDefiningRecipe()); - } - - auto *SrcTy = - cast(toVectorTy(Ctx.Types.inferScalarType(Operand), VF)); - auto *DestTy = cast(toVectorTy(getResultType(), VF)); - // Arm TTI will use the underlying instruction to determine the cost. 
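- // Editorial illustration: a 'zext <8 x i8> to <8 x i16>' whose operand is
- // a masked consecutive load reaches this call with
- // CastContextHint::Masked, letting such targets fold the extend into the
- // widening load when pricing it.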
- return Ctx.TTI.getCastInstrCost( - Opcode, DestTy, SrcTy, CCH, Ctx.CostKind, - dyn_cast_if_present(getUnderlyingValue())); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-CAST "; - printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(Opcode); - printFlags(O); - printOperands(O, SlotTracker); - O << " to " << *getResultType(); -} -#endif - -InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); -} - -/// A helper function that returns an integer or floating-point constant with -/// value C. -static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { - return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) - : ConstantFP::get(Ty, C); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenIntOrFpInductionRecipe::printRecipe( - raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent; - printAsOperand(O, SlotTracker); - O << " = WIDEN-INDUCTION"; - printFlags(O); - O << " "; - printOperands(O, SlotTracker); - - if (auto *TI = getTruncInst()) - O << " (truncated to " << *TI->getType() << ")"; -} -#endif - -bool VPWidenIntOrFpInductionRecipe::isCanonical() const { - // The step may be defined by a recipe in the preheader (e.g. if it requires - // SCEV expansion), but for the canonical induction the step is required to be - // 1, which is represented as live-in. - if (getStepValue()->getDefiningRecipe()) - return false; - auto *StepC = dyn_cast(getStepValue()->getLiveInIRValue()); - auto *StartC = dyn_cast(getStartValue()->getLiveInIRValue()); - return StartC && StartC->isZero() && StepC && StepC->isOne() && - getScalarType() == getRegion()->getCanonicalIVType(); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent; - printAsOperand(O, SlotTracker); - O << " = DERIVED-IV "; - getStartValue()->printAsOperand(O, SlotTracker); - O << " + "; - getOperand(1)->printAsOperand(O, SlotTracker); - O << " * "; - getStepValue()->printAsOperand(O, SlotTracker); -} -#endif - -void VPScalarIVStepsRecipe::execute(VPTransformState &State) { - // Fast-math-flags propagate from the original induction instruction. - IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); - if (hasFastMathFlags()) - State.Builder.setFastMathFlags(getFastMathFlags()); - - /// Compute scalar induction steps. \p ScalarIV is the scalar induction - /// variable on which to base the steps, \p Step is the size of the step. - - Value *BaseIV = State.get(getOperand(0), VPLane(0)); - Value *Step = State.get(getStepValue(), VPLane(0)); - IRBuilderBase &Builder = State.Builder; - - // Ensure step has the same type as that of scalar IV. - Type *BaseIVTy = BaseIV->getType()->getScalarType(); - assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); - - // We build scalar steps for both integer and floating-point induction - // variables. Here, we determine the kind of arithmetic we will perform. 
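- // Editorial illustration: with the opcodes chosen below, lane l of an
- // integer IV is computed as
- //   add(BaseIV, mul(StartIdx0 + l, Step))
- // while a floating-point IV substitutes the induction opcode (e.g. fadd)
- // for the add and fmul for the mul.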
- Instruction::BinaryOps AddOp; - Instruction::BinaryOps MulOp; - if (BaseIVTy->isIntegerTy()) { - AddOp = Instruction::Add; - MulOp = Instruction::Mul; - } else { - AddOp = InductionOpcode; - MulOp = Instruction::FMul; - } - - // Determine the number of scalars we need to generate for each unroll - // iteration. - bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); - // Compute the scalar steps and save the results in State. - Type *IntStepTy = - IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); - - unsigned StartLane = 0; - unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); - if (State.Lane) { - StartLane = State.Lane->getKnownLane(); - EndLane = StartLane + 1; - } - Value *StartIdx0; - if (getUnrollPart(*this) == 0) - StartIdx0 = ConstantInt::get(IntStepTy, 0); - else { - StartIdx0 = State.get(getOperand(2), true); - if (getUnrollPart(*this) != 1) { - StartIdx0 = - Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(), - getUnrollPart(*this))); - } - StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy); - } - - if (BaseIVTy->isFloatingPointTy()) - StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); - - for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { - Value *StartIdx = Builder.CreateBinOp( - AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); - // The step returned by `createStepForVF` is a runtime-evaluated value - // when VF is scalable. Otherwise, it should be folded into a Constant. - assert((State.VF.isScalable() || isa(StartIdx)) && - "Expected StartIdx to be folded to a constant when VF is not " - "scalable"); - auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); - auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); - State.set(this, Add, VPLane(Lane)); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent; - printAsOperand(O, SlotTracker); - O << " = SCALAR-STEPS "; - printOperands(O, SlotTracker); -} -#endif - -bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const { - assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return vputils::isSingleScalar(Op); -} - -void VPWidenGEPRecipe::execute(VPTransformState &State) { - assert(State.VF.isVector() && "not widening"); - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - - assert( - any_of(operands(), - [](VPValue *Op) { return !Op->isDefinedOutsideLoopRegions(); }) && - "Expected at least one loop-variant operand"); - - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers unless VF is scalar. - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant()); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector Indices; - for (unsigned I = 1, E = getNumOperands(); I < E; I++) { - VPValue *Operand = getOperand(I); - Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1))); - } - - // Create the new GEP. 
Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices, - "", getGEPNoWrapFlags()); - assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(this, NewGEP); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-GEP "; - O << (isPointerLoopInvariant() ? "Inv" : "Var"); - for (size_t I = 0; I < getNumOperands() - 1; ++I) - O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; - - O << " "; - printAsOperand(O, SlotTracker); - O << " = getelementptr"; - printFlags(O); - printOperands(O, SlotTracker); -} -#endif - -void VPVectorEndPointerRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - unsigned CurrentPart = getUnrollPart(*this); - const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); - Type *IndexTy = DL.getIndexType(State.TypeAnalysis.inferScalarType(this)); - - // The wide store needs to start at the last vector element. - Value *RunTimeVF = State.get(getVFValue(), VPLane(0)); - if (IndexTy != RunTimeVF->getType()) - RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy); - // NumElt = Stride * CurrentPart * RunTimeVF - Value *NumElt = Builder.CreateMul( - ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF); - // LastLane = Stride * (RunTimeVF - 1) - Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1)); - if (Stride != 1) - LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane); - Value *Ptr = State.get(getOperand(0), VPLane(0)); - Value *ResultPtr = - Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); - ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", - getGEPNoWrapFlags()); - - State.set(this, ResultPtr, /*IsScalar*/ true); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent; - printAsOperand(O, SlotTracker); - O << " = vector-end-pointer"; - printFlags(O); - printOperands(O, SlotTracker); -} -#endif - -void VPVectorPointerRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - unsigned CurrentPart = getUnrollPart(*this); - const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); - Type *IndexTy = DL.getIndexType(State.TypeAnalysis.inferScalarType(this)); - Value *Ptr = State.get(getOperand(0), VPLane(0)); - - Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); - Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment, - "", getGEPNoWrapFlags()); - - State.set(this, ResultPtr, /*IsScalar*/ true); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent; - printAsOperand(O, SlotTracker); - O << " = vector-pointer"; - printFlags(O); - printOperands(O, SlotTracker); -} -#endif - -InstructionCost VPBlendRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); - Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); - return (getNumIncomingValues() - 1) * - Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, 
CmpTy, - CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "BLEND "; - printAsOperand(O, SlotTracker); - O << " ="; - if (getNumIncomingValues() == 1) { - // Not a User of any mask: not really blending, this is a - // single-predecessor phi. - O << " "; - getIncomingValue(0)->printAsOperand(O, SlotTracker); - } else { - for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { - O << " "; - getIncomingValue(I)->printAsOperand(O, SlotTracker); - if (I == 0) - continue; - O << "/"; - getMask(I)->printAsOperand(O, SlotTracker); - } - } -} -#endif - -void VPReductionRecipe::execute(VPTransformState &State) { - assert(!State.Lane && "Reduction being replicated."); - RecurKind Kind = getRecurrenceKind(); - assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && - "In-loop AnyOf reductions aren't currently supported"); - // Propagate the fast-math flags carried by the underlying instruction. - IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); - State.Builder.setFastMathFlags(getFastMathFlags()); - Value *NewVecOp = State.get(getVecOp()); - if (VPValue *Cond = getCondOp()) { - Value *NewCond = State.get(Cond, State.VF.isScalar()); - VectorType *VecTy = dyn_cast(NewVecOp->getType()); - Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); - - Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags()); - if (State.VF.isVector()) - Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); - - Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); - NewVecOp = Select; - } - Value *NewRed; - Value *NextInChain; - if (isOrdered()) { - Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); - if (State.VF.isVector()) - NewRed = - createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain); - else - NewRed = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), - PrevInChain, NewVecOp); - PrevInChain = NewRed; - NextInChain = NewRed; - } else if (isPartialReduction()) { - assert(Kind == RecurKind::Add && "Unexpected partial reduction kind"); - Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false); - NewRed = State.Builder.CreateIntrinsic( - PrevInChain->getType(), Intrinsic::vector_partial_reduce_add, - {PrevInChain, NewVecOp}, nullptr, "partial.reduce"); - PrevInChain = NewRed; - NextInChain = NewRed; - } else { - assert(isInLoop() && - "The reduction must either be ordered, partial or in-loop"); - Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); - NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) - NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain); - else - NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), - PrevInChain, NewRed); - } - State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction()); -} - -void VPReductionEVLRecipe::execute(VPTransformState &State) { - assert(!State.Lane && "Reduction being replicated."); - - auto &Builder = State.Builder; - // Propagate the fast-math flags carried by the underlying instruction. 
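- // Editorial note: FastMathFlagGuard is an RAII helper; the flags set on
- // the builder below apply to every instruction this recipe creates and
- // the previous flags are restored when the guard goes out of scope.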
- IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); - Builder.setFastMathFlags(getFastMathFlags()); - - RecurKind Kind = getRecurrenceKind(); - Value *Prev = State.get(getChainOp(), /*IsScalar*/ true); - Value *VecOp = State.get(getVecOp()); - Value *EVL = State.get(getEVL(), VPLane(0)); - - Value *Mask; - if (VPValue *CondOp = getCondOp()) - Mask = State.get(CondOp); - else - Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); - - Value *NewRed; - if (isOrdered()) { - NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL); - } else { - NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) - NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); - else - NewRed = Builder.CreateBinOp( - (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed, - Prev); - } - State.set(this, NewRed, /*IsScalar*/ true); -} - -InstructionCost VPReductionRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - RecurKind RdxKind = getRecurrenceKind(); - Type *ElementTy = Ctx.Types.inferScalarType(this); - auto *VectorTy = cast(toVectorTy(ElementTy, VF)); - unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind); - FastMathFlags FMFs = getFastMathFlags(); - std::optional OptionalFMF = - ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt; - - if (isPartialReduction()) { - InstructionCost CondCost = 0; - if (isConditional()) { - CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; - auto *CondTy = cast( - toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF)); - CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy, - CondTy, Pred, Ctx.CostKind); - } - return CondCost + Ctx.TTI.getPartialReductionCost( - Opcode, ElementTy, ElementTy, ElementTy, VF, - TargetTransformInfo::PR_None, - TargetTransformInfo::PR_None, std::nullopt, - Ctx.CostKind); - } - - // TODO: Support any-of reductions. - assert( - (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || - ForceTargetInstructionCost.getNumOccurrences() > 0) && - "Any-of reduction not implemented in VPlan-based cost model currently."); - - // Note that TTI should model the cost of moving result to the scalar register - // and the BinOp cost in the getMinMaxReductionCost(). - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { - Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); - return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind); - } - - // Note that TTI should model the cost of moving result to the scalar register - // and the BinOp cost in the getArithmeticReductionCost(). - return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF, - Ctx.CostKind); -} - -VPExpressionRecipe::VPExpressionRecipe( - ExpressionTypes ExpressionType, - ArrayRef ExpressionRecipes) - : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}), - ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) { - assert(!ExpressionRecipes.empty() && "Nothing to combine?"); - assert( - none_of(ExpressionRecipes, - [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && - "expression cannot contain recipes with side-effects"); - - // Maintain a copy of the expression recipes as a set of users. - SmallPtrSet ExpressionRecipesAsSetOfUsers; - for (auto *R : ExpressionRecipes) - ExpressionRecipesAsSetOfUsers.insert(R); - - // Recipes in the expression, except the last one, must only be used by - // (other) recipes inside the expression. 
If there are other users, external
- // to the expression, use a clone of the recipe for external users.
- for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
-   if (R != ExpressionRecipes.back() &&
-       any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
-         return !ExpressionRecipesAsSetOfUsers.contains(U);
-       })) {
-     // There are users outside of the expression. Clone the recipe and use
-     // the clone for those external users.
-     VPSingleDefRecipe *CopyForExtUsers = R->clone();
-     R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
-                                               VPUser &U, unsigned) {
-       return !ExpressionRecipesAsSetOfUsers.contains(&U);
-     });
-     CopyForExtUsers->insertBefore(R);
-   }
-   if (R->getParent())
-     R->removeFromParent();
- }
-
- // Internalize all external operands to the expression recipes. To do so,
- // create new temporary VPValues for all operands defined by a recipe outside
- // the expression. The original operands are added as operands of the
- // VPExpressionRecipe itself.
- for (auto *R : ExpressionRecipes) {
-   for (const auto &[Idx, Op] : enumerate(R->operands())) {
-     auto *Def = Op->getDefiningRecipe();
-     if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
-       continue;
-     addOperand(Op);
-     LiveInPlaceholders.push_back(new VPValue());
-   }
- }
-
- // Replace each external operand with the first one created for it in
- // LiveInPlaceholders.
- for (auto *R : ExpressionRecipes)
-   for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
-     R->replaceUsesOfWith(LiveIn, Tmp);
-}
-
-void VPExpressionRecipe::decompose() {
-  for (auto *R : ExpressionRecipes)
-    // Since the list could contain duplicates, make sure the recipe hasn't
-    // already been inserted.
-    if (!R->getParent())
-      R->insertBefore(this);
-
-  for (const auto &[Idx, Op] : enumerate(operands()))
-    LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
-
-  replaceAllUsesWith(ExpressionRecipes.back());
-  ExpressionRecipes.clear();
-}
-
-InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
-                                                VPCostContext &Ctx) const {
-  Type *RedTy = Ctx.Types.inferScalarType(this);
-  auto *SrcVecTy = cast<VectorType>(
-      toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
-  assert(RedTy->isIntegerTy() &&
-         "VPExpressionRecipe only supports integer types currently.");
-  unsigned Opcode = RecurrenceDescriptor::getOpcode(
-      cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
-  switch (ExpressionType) {
-  case ExpressionTypes::ExtendedReduction: {
-    unsigned Opcode = RecurrenceDescriptor::getOpcode(
-        cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
-    auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
-
-    return cast<VPReductionRecipe>(ExpressionRecipes.back())
-               ->isPartialReduction()
-               ?
Ctx.TTI.getPartialReductionCost( - Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, - RedTy, VF, - TargetTransformInfo::getPartialReductionExtendKind( - ExtR->getOpcode()), - TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind) - : Ctx.TTI.getExtendedReductionCost( - Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, - SrcVecTy, std::nullopt, Ctx.CostKind); - } - case ExpressionTypes::MulAccReduction: - return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy, - Ctx.CostKind); - - case ExpressionTypes::ExtNegatedMulAccReduction: - assert(Opcode == Instruction::Add && "Unexpected opcode"); - Opcode = Instruction::Sub; - [[fallthrough]]; - case ExpressionTypes::ExtMulAccReduction: { - auto *RedR = cast(ExpressionRecipes.back()); - if (RedR->isPartialReduction()) { - auto *Ext0R = cast(ExpressionRecipes[0]); - auto *Ext1R = cast(ExpressionRecipes[1]); - auto *Mul = cast(ExpressionRecipes[2]); - return Ctx.TTI.getPartialReductionCost( - Opcode, Ctx.Types.inferScalarType(getOperand(0)), - Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF, - TargetTransformInfo::getPartialReductionExtendKind( - Ext0R->getOpcode()), - TargetTransformInfo::getPartialReductionExtendKind( - Ext1R->getOpcode()), - Mul->getOpcode(), Ctx.CostKind); - } - return Ctx.TTI.getMulAccReductionCost( - cast(ExpressionRecipes.front())->getOpcode() == - Instruction::ZExt, - Opcode, RedTy, SrcVecTy, Ctx.CostKind); - } - } - llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum"); -} - -bool VPExpressionRecipe::mayReadOrWriteMemory() const { - return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) { - return R->mayReadFromMemory() || R->mayWriteToMemory(); - }); -} - -bool VPExpressionRecipe::mayHaveSideEffects() const { - assert( - none_of(ExpressionRecipes, - [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && - "expression cannot contain recipes with side-effects"); - return false; -} - -bool VPExpressionRecipe::isSingleScalar() const { - // Cannot use vputils::isSingleScalar(), because all external operands - // of the expression will be live-ins while bundled. - auto *RR = dyn_cast(ExpressionRecipes.back()); - return RR && !RR->isPartialReduction(); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - -void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EXPRESSION "; - printAsOperand(O, SlotTracker); - O << " = "; - auto *Red = cast(ExpressionRecipes.back()); - unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); - - switch (ExpressionType) { - case ExpressionTypes::ExtendedReduction: { - getOperand(1)->printAsOperand(O, SlotTracker); - O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce."; - O << Instruction::getOpcodeName(Opcode) << " ("; - getOperand(0)->printAsOperand(O, SlotTracker); - Red->printFlags(O); - - auto *Ext0 = cast(ExpressionRecipes[0]); - O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " - << *Ext0->getResultType(); - if (Red->isConditional()) { - O << ", "; - Red->getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; - break; - } - case ExpressionTypes::ExtNegatedMulAccReduction: { - getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + " << (Red->isPartialReduction() ? "partial." 
: "") << "reduce."; - O << Instruction::getOpcodeName( - RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) - << " (sub (0, mul"; - auto *Mul = cast(ExpressionRecipes[2]); - Mul->printFlags(O); - O << "("; - getOperand(0)->printAsOperand(O, SlotTracker); - auto *Ext0 = cast(ExpressionRecipes[0]); - O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " - << *Ext0->getResultType() << "), ("; - getOperand(1)->printAsOperand(O, SlotTracker); - auto *Ext1 = cast(ExpressionRecipes[1]); - O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " - << *Ext1->getResultType() << ")"; - if (Red->isConditional()) { - O << ", "; - Red->getCondOp()->printAsOperand(O, SlotTracker); - } - O << "))"; - break; - } - case ExpressionTypes::MulAccReduction: - case ExpressionTypes::ExtMulAccReduction: { - getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce."; - O << Instruction::getOpcodeName( - RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) - << " ("; - O << "mul"; - bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction; - auto *Mul = cast(IsExtended ? ExpressionRecipes[2] - : ExpressionRecipes[0]); - Mul->printFlags(O); - if (IsExtended) - O << "("; - getOperand(0)->printAsOperand(O, SlotTracker); - if (IsExtended) { - auto *Ext0 = cast(ExpressionRecipes[0]); - O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " - << *Ext0->getResultType() << "), ("; - } else { - O << ", "; - } - getOperand(1)->printAsOperand(O, SlotTracker); - if (IsExtended) { - auto *Ext1 = cast(ExpressionRecipes[1]); - O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " - << *Ext1->getResultType() << ")"; - } - if (Red->isConditional()) { - O << ", "; - Red->getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; - break; - } - } -} - -void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - if (isPartialReduction()) - O << Indent << "PARTIAL-REDUCE "; - else - O << Indent << "REDUCE "; - printAsOperand(O, SlotTracker); - O << " = "; - getChainOp()->printAsOperand(O, SlotTracker); - O << " +"; - printFlags(O); - O << " reduce." - << Instruction::getOpcodeName( - RecurrenceDescriptor::getOpcode(getRecurrenceKind())) - << " ("; - getVecOp()->printAsOperand(O, SlotTracker); - if (isConditional()) { - O << ", "; - getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; -} - -void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "REDUCE "; - printAsOperand(O, SlotTracker); - O << " = "; - getChainOp()->printAsOperand(O, SlotTracker); - O << " +"; - printFlags(O); - O << " vp.reduce." - << Instruction::getOpcodeName( - RecurrenceDescriptor::getOpcode(getRecurrenceKind())) - << " ("; - getVecOp()->printAsOperand(O, SlotTracker); - O << ", "; - getEVL()->printAsOperand(O, SlotTracker); - if (isConditional()) { - O << ", "; - getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; -} - -#endif - -/// A helper function to scalarize a single Instruction in the innermost loop. -/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue -/// operands from \p RepRecipe instead of \p Instr's operands. 
-static void scalarizeInstruction(const Instruction *Instr,
-                                 VPReplicateRecipe *RepRecipe,
-                                 const VPLane &Lane, VPTransformState &State) {
-  assert((!Instr->getType()->isAggregateType() ||
-          canVectorizeTy(Instr->getType())) &&
-         "Expected vectorizable or non-aggregate type.");
-
-  // Does this instruction return a value?
-  bool IsVoidRetTy = Instr->getType()->isVoidTy();
-
-  Instruction *Cloned = Instr->clone();
-  if (!IsVoidRetTy) {
-    Cloned->setName(Instr->getName() + ".cloned");
-    Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
-    // The operands of the replicate recipe may have been narrowed, resulting
-    // in a narrower result type. Update the type of the cloned instruction to
-    // the correct type.
-    if (ResultTy != Cloned->getType())
-      Cloned->mutateType(ResultTy);
-  }
-
-  RepRecipe->applyFlags(*Cloned);
-  RepRecipe->applyMetadata(*Cloned);
-
-  if (RepRecipe->hasPredicate())
-    cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
-
-  if (auto DL = RepRecipe->getDebugLoc())
-    State.setDebugLocFrom(DL);
-
-  // Replace the operands of the cloned instruction with their scalar
-  // equivalents in the new loop.
-  for (const auto &I : enumerate(RepRecipe->operands())) {
-    auto InputLane = Lane;
-    VPValue *Operand = I.value();
-    if (vputils::isSingleScalar(Operand))
-      InputLane = VPLane::getFirstLane();
-    Cloned->setOperand(I.index(), State.get(Operand, InputLane));
-  }
-
-  // Place the cloned scalar in the new loop.
-  State.Builder.Insert(Cloned);
-
-  State.set(RepRecipe, Cloned, Lane);
-
-  // If we just cloned a new assumption, add it to the assumption cache.
-  if (auto *II = dyn_cast<AssumeInst>(Cloned))
-    State.AC->registerAssumption(II);
-
-  assert((RepRecipe->getRegion() ||
-          !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
-          all_of(RepRecipe->operands(),
-                 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
-         "Expected the recipe to either be within a region or to have all of "
-         "its operands defined outside the vectorized region.");
-}
-
-void VPReplicateRecipe::execute(VPTransformState &State) {
-  Instruction *UI = getUnderlyingInstr();
-
-  if (!State.Lane) {
-    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
-                             "must have already been unrolled");
-    scalarizeInstruction(UI, this, VPLane(0), State);
-    return;
-  }
-
-  assert((State.VF.isScalar() || !isSingleScalar()) &&
-         "uniform recipe shouldn't be predicated");
-  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-  scalarizeInstruction(UI, this, *State.Lane, State);
-  // Insert the scalar instance, packing it into a vector if needed.
-  if (State.VF.isVector() && shouldPack()) {
-    Value *WideValue =
-        State.Lane->isFirstLane()
-            ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
-            : State.get(this);
-    State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
-                                                        *State.Lane));
-  }
-}
-
-bool VPReplicateRecipe::shouldPack() const {
-  // Find out if the recipe is used by a widened recipe via an intervening
-  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a
-  // vector.
-  return any_of(users(), [](const VPUser *U) {
-    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
-      return !vputils::onlyScalarValuesUsed(PredR);
-    return false;
-  });
-}
-
-/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
-/// which the legacy cost model computes a SCEV expression when computing the
-/// address cost. Computing SCEVs for VPValues is incomplete and returns
-/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs.
In -/// those cases we fall back to the legacy cost model. Otherwise return nullptr. -static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE, - const Loop *L) { - auto *PtrR = Ptr->getDefiningRecipe(); - if (!PtrR || !((isa(Ptr) && - cast(Ptr)->getOpcode() == - Instruction::GetElementPtr) || - isa(Ptr) || - match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue())))) - return nullptr; - - // We are looking for a GEP where all indices are either loop invariant or - // inductions. - for (VPValue *Opd : drop_begin(PtrR->operands())) { - if (!Opd->isDefinedOutsideLoopRegions() && - !isa(Opd)) - return nullptr; - } - - return vputils::getSCEVExprForVPValue(Ptr, SE, L); -} - -/// Returns true if \p V is used as part of the address of another load or -/// store. -static bool isUsedByLoadStoreAddress(const VPUser *V) { - SmallPtrSet Seen; - SmallVector WorkList = {V}; - - while (!WorkList.empty()) { - auto *Cur = dyn_cast(WorkList.pop_back_val()); - if (!Cur || !Seen.insert(Cur).second) - continue; - - auto *Blend = dyn_cast(Cur); - // Skip blends that use V only through a compare by checking if any incoming - // value was already visited. - if (Blend && none_of(seq(0, Blend->getNumIncomingValues()), - [&](unsigned I) { - return Seen.contains( - Blend->getIncomingValue(I)->getDefiningRecipe()); - })) - continue; - - for (VPUser *U : Cur->users()) { - if (auto *InterleaveR = dyn_cast(U)) - if (InterleaveR->getAddr() == Cur) - return true; - if (auto *RepR = dyn_cast(U)) { - if (RepR->getOpcode() == Instruction::Load && - RepR->getOperand(0) == Cur) - return true; - if (RepR->getOpcode() == Instruction::Store && - RepR->getOperand(1) == Cur) - return true; - } - if (auto *MemR = dyn_cast(U)) { - if (MemR->getAddr() == Cur && MemR->isConsecutive()) - return true; - } - } - - // The legacy cost model only supports scalarization loads/stores with phi - // addresses, if the phi is directly used as load/store address. Don't - // traverse further for Blends. - if (Blend) - continue; - - append_range(WorkList, Cur->users()); - } - return false; -} - -InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - Instruction *UI = cast(getUnderlyingValue()); - // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan - // transform, avoid computing their cost multiple times for now. - Ctx.SkipCostComputation.insert(UI); - - if (VF.isScalable() && !isSingleScalar()) - return InstructionCost::getInvalid(); - - switch (UI->getOpcode()) { - case Instruction::GetElementPtr: - // We mark this instruction as zero-cost because the cost of GEPs in - // vectorized code depends on whether the corresponding memory instruction - // is scalarized or not. Therefore, we handle GEPs with the memory - // instruction cost. - return 0; - case Instruction::Call: { - auto *CalledFn = - cast(getOperand(getNumOperands() - 1)->getLiveInIRValue()); - - SmallVector ArgOps(drop_end(operands())); - SmallVector Tys; - for (const VPValue *ArgOp : ArgOps) - Tys.push_back(Ctx.Types.inferScalarType(ArgOp)); - - if (CalledFn->isIntrinsic()) - // Various pseudo-intrinsics with costs of 0 are scalarized instead of - // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early. 
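- // Editorial note: e.g. 'call void @llvm.assume(i1 %c)' remains a scalar
- // call and is modelled as free; the switch below whitelists these
- // zero-cost pseudo-intrinsics.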
- switch (CalledFn->getIntrinsicID()) { - case Intrinsic::assume: - case Intrinsic::lifetime_end: - case Intrinsic::lifetime_start: - case Intrinsic::sideeffect: - case Intrinsic::pseudoprobe: - case Intrinsic::experimental_noalias_scope_decl: { - assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, - ElementCount::getFixed(1), Ctx) == 0 && - "scalarizing intrinsic should be free"); - return InstructionCost(0); - } - default: - break; - } - - Type *ResultTy = Ctx.Types.inferScalarType(this); - InstructionCost ScalarCallCost = - Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); - if (isSingleScalar()) { - if (CalledFn->isIntrinsic()) - ScalarCallCost = std::min( - ScalarCallCost, - getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, - ElementCount::getFixed(1), Ctx)); - return ScalarCallCost; - } - - return ScalarCallCost * VF.getFixedValue() + - Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF); - } - case Instruction::Add: - case Instruction::Sub: - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::ICmp: - case Instruction::FCmp: - return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), - Ctx) * - (isSingleScalar() ? 1 : VF.getFixedValue()); - case Instruction::SDiv: - case Instruction::UDiv: - case Instruction::SRem: - case Instruction::URem: { - InstructionCost ScalarCost = - getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx); - if (isSingleScalar()) - return ScalarCost; - - ScalarCost = ScalarCost * VF.getFixedValue() + - Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this), - to_vector(operands()), VF); - // If the recipe is not predicated (i.e. not in a replicate region), return - // the scalar cost. Otherwise handle predicated cost. - if (!getRegion()->isReplicator()) - return ScalarCost; - - // Account for the phi nodes that we will create. - ScalarCost += VF.getFixedValue() * - Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); - // Scale the cost by the probability of executing the predicated blocks. - // This assumes the predicated block for each vector lane is equally - // likely. - ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent()); - return ScalarCost; - } - case Instruction::Load: - case Instruction::Store: { - // TODO: See getMemInstScalarizationCost for how to handle replicating and - // predicated cases. - const VPRegionBlock *ParentRegion = getRegion(); - if (ParentRegion && ParentRegion->isReplicator()) - break; - - bool IsLoad = UI->getOpcode() == Instruction::Load; - const VPValue *PtrOp = getOperand(!IsLoad); - const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L); - if (isa_and_nonnull(PtrSCEV)) - break; - - Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); - Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp); - const Align Alignment = getLoadStoreAlignment(UI); - unsigned AS = cast(ScalarPtrTy)->getAddressSpace(); - TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); - InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( - UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo); - - Type *PtrTy = isSingleScalar() ? 
ScalarPtrTy : toVectorTy(ScalarPtrTy, VF); - bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); - bool UsedByLoadStoreAddress = - !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this); - InstructionCost ScalarCost = - ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( - PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, - PtrSCEV, Ctx.CostKind); - if (isSingleScalar()) - return ScalarCost; - - SmallVector OpsToScalarize; - Type *ResultTy = Type::getVoidTy(PtrTy->getContext()); - // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we - // don't assign scalarization overhead in general, if the target prefers - // vectorized addressing or the loaded value is used as part of an address - // of another load or store. - if (!UsedByLoadStoreAddress) { - bool EfficientVectorLoadStore = - Ctx.TTI.supportsEfficientVectorElementLoadStore(); - if (!(IsLoad && !PreferVectorizedAddressing) && - !(!IsLoad && EfficientVectorLoadStore)) - append_range(OpsToScalarize, operands()); - - if (!EfficientVectorLoadStore) - ResultTy = Ctx.Types.inferScalarType(this); - } - - return (ScalarCost * VF.getFixedValue()) + - Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true); - } - } - - return Ctx.getLegacyCost(UI, VF); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE "); - - if (!getUnderlyingInstr()->getType()->isVoidTy()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - if (auto *CB = dyn_cast(getUnderlyingInstr())) { - O << "call"; - printFlags(O); - O << "@" << CB->getCalledFunction()->getName() << "("; - interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), - O, [&O, &SlotTracker](VPValue *Op) { - Op->printAsOperand(O, SlotTracker); - }); - O << ")"; - } else { - O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); - printFlags(O); - printOperands(O, SlotTracker); - } - - if (shouldPack()) - O << " (S->V)"; -} -#endif - -void VPBranchOnMaskRecipe::execute(VPTransformState &State) { - assert(State.Lane && "Branch on Mask works only on single instance."); - - VPValue *BlockInMask = getOperand(0); - Value *ConditionBit = State.get(BlockInMask, *State.Lane); - - // Replace the temporary unreachable terminator with a new conditional branch, - // whose two destinations will be set later when they are created. - auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); - assert(isa(CurrentTerminator) && - "Expected to replace unreachable terminator with conditional branch."); - auto CondBr = - State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr); - CondBr->setSuccessor(0, nullptr); - CurrentTerminator->eraseFromParent(); -} - -InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - // The legacy cost model doesn't assign costs to branches for individual - // replicate regions. Match the current behavior in the VPlan cost model for - // now. 
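- // Editorial note: i.e. the per-lane conditional branch created by
- // VPBranchOnMaskRecipe::execute above is deliberately costed as free.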
-  return 0;
-}
-
-void VPPredInstPHIRecipe::execute(VPTransformState &State) {
-  assert(State.Lane && "Predicated instruction PHI works per instance.");
-  Instruction *ScalarPredInst =
-      cast<Instruction>(State.get(getOperand(0), *State.Lane));
-  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
-  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
-  assert(PredicatingBB && "Predicated block has no single predecessor.");
-  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
-         "operand must be VPReplicateRecipe");
-
-  // By current pack/unpack logic we need to generate only a single phi node: if
-  // a vector value for the predicated instruction exists at this point it means
-  // the instruction has vector users only, and a phi for the vector value is
-  // needed. In this case the recipe of the predicated instruction is marked to
-  // also do that packing, thereby "hoisting" the insert-element sequence.
-  // Otherwise, a phi node for the scalar value is needed.
-  if (State.hasVectorValue(getOperand(0))) {
-    auto *VecI = cast<Instruction>(State.get(getOperand(0)));
-    assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
-           "Packed operands must generate an insertelement or insertvalue");
-
-    // If VecI is a struct, it will be a sequence like:
-    //   %1 = insertvalue %unmodified, %x, 0
-    //   %2 = insertvalue %1, %y, 1
-    //   %VecI = insertvalue %2, %z, 2
-    // To get the unmodified vector we need to look through the chain.
-    if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
-      for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
-        VecI = cast<Instruction>(VecI->getOperand(0));
-
-    PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
-    VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
-    VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
-    if (State.hasVectorValue(this))
-      State.reset(this, VPhi);
-    else
-      State.set(this, VPhi);
-    // NOTE: Currently we need to update the value of the operand, so the next
-    // predicated iteration inserts its generated value in the correct vector.
-    State.reset(getOperand(0), VPhi);
-  } else {
-    if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
-      return;
-
-    Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
-    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
-    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
-                     PredicatingBB);
-    Phi->addIncoming(ScalarPredInst, PredicatedBB);
-    if (State.hasScalarValue(this, *State.Lane))
-      State.reset(this, Phi, *State.Lane);
-    else
-      State.set(this, Phi, *State.Lane);
-    // NOTE: Currently we need to update the value of the operand, so the next
-    // predicated iteration inserts its generated value in the correct vector.
-    State.reset(getOperand(0), Phi, *State.Lane);
-  }
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                      VPSlotTracker &SlotTracker) const {
-  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
-  printAsOperand(O, SlotTracker);
-  O << " = ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
-                                                 VPCostContext &Ctx) const {
-  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
-                    ->getAddressSpace();
-  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
-                        ? Instruction::Load
-                        : Instruction::Store;
-
-  if (!Consecutive) {
-    // TODO: Using the original IR may not be accurate.
-    // Currently, ARM will use the underlying IR to calculate gather/scatter
-    // instruction cost.
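-    // E.g. a non-consecutive widened load is costed as a gather like the
-    // following (illustrative, VF=4):
-    //   %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs,
-    //            i32 4, <4 x i1> %mask, <4 x i32> poison)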
-    assert(!Reverse &&
-           "Inconsecutive memory access should not be reversed.");
-
-    const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
-    Type *PtrTy = Ptr->getType();
-
-    // If the address value is uniform across all lanes, then the address can
-    // be calculated with scalar type and broadcast.
-    if (!vputils::isSingleScalar(getAddr()))
-      PtrTy = toVectorTy(PtrTy, VF);
-
-    unsigned IID = isa<VPWidenLoadRecipe>(this)      ? Intrinsic::masked_gather
-                   : isa<VPWidenStoreRecipe>(this)   ? Intrinsic::masked_scatter
-                   : isa<VPWidenLoadEVLRecipe>(this) ? Intrinsic::vp_gather
-                                                     : Intrinsic::vp_scatter;
-    return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
-                                             Ctx.CostKind) +
-           Ctx.TTI.getMemIntrinsicInstrCost(
-               MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
-                                          &Ingredient),
-               Ctx.CostKind);
-  }
-
-  InstructionCost Cost = 0;
-  if (IsMasked) {
-    unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_load
-                                                : Intrinsic::masked_store;
-    Cost += Ctx.TTI.getMemIntrinsicInstrCost(
-        MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
-  } else {
-    TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
-        isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
-                                                           : getOperand(1));
-    Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
-                                    OpInfo, &Ingredient);
-  }
-  if (!Reverse)
-    return Cost;
-
-  return Cost + Ctx.TTI.getShuffleCost(
-                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
-                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
-}
-
-void VPWidenLoadRecipe::execute(VPTransformState &State) {
-  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
-  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-  bool CreateGather = !isConsecutive();
-
-  auto &Builder = State.Builder;
-  Value *Mask = nullptr;
-  if (auto *VPMask = getMask()) {
-    // Mask reversal is only needed for non-all-one (null) masks, as reverse
-    // of a null all-one mask is a null mask.
-    Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = Builder.CreateVectorReverse(Mask, "reverse");
-  }
-
-  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
-  Value *NewLI;
-  if (CreateGather) {
-    NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
-                                       "wide.masked.gather");
-  } else if (Mask) {
-    NewLI =
-        Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
-                                 PoisonValue::get(DataTy), "wide.masked.load");
-  } else {
-    NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
-  }
-  applyMetadata(*cast<Instruction>(NewLI));
-  if (Reverse)
-    NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
-  State.set(this, NewLI);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                    VPSlotTracker &SlotTracker) const {
-  O << Indent << "WIDEN ";
-  printAsOperand(O, SlotTracker);
-  O << " = load ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
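-/// E.g. with EVL = 3 and operand <8 x i32> <a,b,c,d,e,f,g,h>, this emits
-/// (illustrative):
-///   %r = call <8 x i32> @llvm.experimental.vp.reverse.v8i32(
-///            <8 x i32> %op, <8 x i1> splat (i1 true), i32 3)
-/// which yields <c,b,a,...> with the lanes past EVL unspecified.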
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
-                                     Value *EVL, const Twine &Name) {
-  VectorType *ValTy = cast<VectorType>(Operand->getType());
-  Value *AllTrueMask =
-      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
-  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
-                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
-void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
-  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
-  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-  bool CreateGather = !isConsecutive();
-
-  auto &Builder = State.Builder;
-  CallInst *NewLI;
-  Value *EVL = State.get(getEVL(), VPLane(0));
-  Value *Addr = State.get(getAddr(), !CreateGather);
-  Value *Mask = nullptr;
-  if (VPValue *VPMask = getMask()) {
-    Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
-  } else {
-    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-  }
-
-  if (CreateGather) {
-    NewLI =
-        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
-                                nullptr, "wide.masked.gather");
-  } else {
-    NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
-                                    {Addr, Mask, EVL}, nullptr, "vp.op.load");
-  }
-  NewLI->addParamAttr(
-      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
-  applyMetadata(*NewLI);
-  Instruction *Res = NewLI;
-  if (isReverse())
-    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
-  State.set(this, Res);
-}
-
-InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
-                                                  VPCostContext &Ctx) const {
-  if (!Consecutive || IsMasked)
-    return VPWidenMemoryRecipe::computeCost(VF, Ctx);
-
-  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
-  // here because the EVL recipes use EVL to replace the tail mask, while the
-  // legacy model always accounts for the cost of the mask.
-  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
-  // no longer need to compare against the legacy cost model.
-  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
-                    ->getAddressSpace();
-  InstructionCost Cost = Ctx.TTI.getMemIntrinsicInstrCost(
-      MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
-      Ctx.CostKind);
-  if (!Reverse)
-    return Cost;
-
-  return Cost + Ctx.TTI.getShuffleCost(
-                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
-                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                       VPSlotTracker &SlotTracker) const {
-  O << Indent << "WIDEN ";
-  printAsOperand(O, SlotTracker);
-  O << " = vp.load ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-void VPWidenStoreRecipe::execute(VPTransformState &State) {
-  VPValue *StoredVPValue = getStoredValue();
-  bool CreateScatter = !isConsecutive();
-
-  auto &Builder = State.Builder;
-
-  Value *Mask = nullptr;
-  if (auto *VPMask = getMask()) {
-    // Mask reversal is only needed for non-all-one (null) masks, as reverse
-    // of a null all-one mask is a null mask.
-    Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = Builder.CreateVectorReverse(Mask, "reverse");
-  }
-
-  Value *StoredVal = State.get(StoredVPValue);
-  if (isReverse()) {
-    // If we store to reverse consecutive memory locations, then we need
-    // to reverse the order of elements in the stored value.
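-    // E.g. a reverse store of <a,b,c,d> first emits (illustrative):
-    //   %rev = shufflevector <4 x i32> %val, <4 x i32> poison,
-    //                        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-    // so that a single consecutive wide store can still be used.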
-    StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
-    // We don't want to update the value in the map as it might be used in
-    // another expression. So don't call resetVectorValue(StoredVal).
-  }
-  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
-  Instruction *NewSI = nullptr;
-  if (CreateScatter)
-    NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
-  else if (Mask)
-    NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
-  else
-    NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
-  applyMetadata(*NewSI);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                     VPSlotTracker &SlotTracker) const {
-  O << Indent << "WIDEN store ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
-  VPValue *StoredValue = getStoredValue();
-  bool CreateScatter = !isConsecutive();
-
-  auto &Builder = State.Builder;
-
-  CallInst *NewSI = nullptr;
-  Value *StoredVal = State.get(StoredValue);
-  Value *EVL = State.get(getEVL(), VPLane(0));
-  if (isReverse())
-    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
-  Value *Mask = nullptr;
-  if (VPValue *VPMask = getMask()) {
-    Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
-  } else {
-    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-  }
-  Value *Addr = State.get(getAddr(), !CreateScatter);
-  if (CreateScatter) {
-    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
-                                    Intrinsic::vp_scatter,
-                                    {StoredVal, Addr, Mask, EVL});
-  } else {
-    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
-                                    Intrinsic::vp_store,
-                                    {StoredVal, Addr, Mask, EVL});
-  }
-  NewSI->addParamAttr(
-      1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
-  applyMetadata(*NewSI);
-}
-
-InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
-                                                   VPCostContext &Ctx) const {
-  if (!Consecutive || IsMasked)
-    return VPWidenMemoryRecipe::computeCost(VF, Ctx);
-
-  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
-  // here because the EVL recipes use EVL to replace the tail mask, while the
-  // legacy model always accounts for the cost of the mask.
-  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
-  // no longer need to compare against the legacy cost model.
-  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
-                    ->getAddressSpace();
-  InstructionCost Cost = Ctx.TTI.getMemIntrinsicInstrCost(
-      MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
-      Ctx.CostKind);
-  if (!Reverse)
-    return Cost;
-
-  return Cost + Ctx.TTI.getShuffleCost(
-                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
-                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                        VPSlotTracker &SlotTracker) const {
-  O << Indent << "WIDEN vp.store ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
-                                     VectorType *DstVTy, const DataLayout &DL) {
-  // Verify that V is a vector type with same number of elements as DstVTy.
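-  // E.g. on a target with 64-bit pointers, <4 x double> -> <4 x ptr> is not
-  // directly castable; it is handled below as the two-step
-  // <4 x double> -> <4 x i64> -> <4 x ptr> (illustrative).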
-  auto VF = DstVTy->getElementCount();
-  auto *SrcVecTy = cast<VectorType>(V->getType());
-  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
-  Type *SrcElemTy = SrcVecTy->getElementType();
-  Type *DstElemTy = DstVTy->getElementType();
-  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
-         "Vector elements must have same size");
-
-  // Do a direct cast if element types are castable.
-  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
-    return Builder.CreateBitOrPointerCast(V, DstVTy);
-  }
-  // V cannot be cast directly to the desired vector type. This may happen when
-  // V is a floating point vector but DstVTy is a vector of pointers or
-  // vice-versa. Handle this with a two-step bitcast through an intermediate
-  // integer vector type, i.e. Ptr <-> Int <-> Float.
-  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
-         "Only one type should be a pointer type");
-  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
-         "Only one type should be a floating point type");
-  Type *IntTy =
-      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
-  auto *VecIntTy = VectorType::get(IntTy, VF);
-  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
-  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
-}
-
-/// Return a vector containing interleaved elements from multiple
-/// smaller input vectors.
-static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
-                                const Twine &Name) {
-  unsigned Factor = Vals.size();
-  assert(Factor > 1 && "Tried to interleave invalid number of vectors");
-
-  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
-#ifndef NDEBUG
-  for (Value *Val : Vals)
-    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
-#endif
-
-  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
-  // must use intrinsics to interleave.
-  if (VecTy->isScalableTy()) {
-    assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
-    return Builder.CreateVectorInterleave(Vals, Name);
-  }
-
-  // Fixed length. Start by concatenating all vectors into a wide vector.
-  Value *WideVec = concatenateVectors(Builder, Vals);
-
-  // Interleave the elements into the wide vector.
-  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
-  return Builder.CreateShuffleVector(
-      WideVec, createInterleaveMask(NumElts, Factor), Name);
-}
-
-// Try to vectorize the interleave group that \p Instr belongs to.
-//
-// E.g. Translate following interleaved load group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-//   R = Pic[i];   // Member of index 0
-//   G = Pic[i+1]; // Member of index 1
-//   B = Pic[i+2]; // Member of index 2
-//   ...           // do something to R, G, B
-// }
-// To:
-// %wide.vec = load <12 x i32>                        ; Read 4 tuples of R,G,B
-// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
-// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
-// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
-//
-// Or translate following interleaved store group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-//   ... do something to R, G, B
-//   Pic[i]   = R; // Member of index 0
-//   Pic[i+1] = G; // Member of index 1
-//   Pic[i+2] = B; // Member of index 2
-// }
-// To:
-// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
-// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
-// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
-//                    <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
-//                                        ; Interleave R,G,B elements
-// store <12 x i32> %interleaved.vec      ; Write 4 tuples of R,G,B
-void VPInterleaveRecipe::execute(VPTransformState &State) {
-  assert(!State.Lane && "Interleave group being replicated.");
-  assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
-         "Masking gaps for scalable vectors is not yet supported.");
-  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
-  Instruction *Instr = Group->getInsertPos();
-
-  // Prepare for the vector type of the interleaved load/store.
-  Type *ScalarTy = getLoadStoreType(Instr);
-  unsigned InterleaveFactor = Group->getFactor();
-  auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
-
-  VPValue *BlockInMask = getMask();
-  VPValue *Addr = getAddr();
-  Value *ResAddr = State.get(Addr, VPLane(0));
-
-  auto CreateGroupMask = [&BlockInMask, &State,
-                          &InterleaveFactor](Value *MaskForGaps) -> Value * {
-    if (State.VF.isScalable()) {
-      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(InterleaveFactor <= 8 &&
-             "Unsupported deinterleave factor for scalable vectors");
-      auto *ResBlockInMask = State.get(BlockInMask);
-      SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
-      return interleaveVectors(State.Builder, Ops, "interleaved.mask");
-    }
-
-    if (!BlockInMask)
-      return MaskForGaps;
-
-    Value *ResBlockInMask = State.get(BlockInMask);
-    Value *ShuffledMask = State.Builder.CreateShuffleVector(
-        ResBlockInMask,
-        createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
-        "interleaved.mask");
-    return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
-                                                   ShuffledMask, MaskForGaps)
-                       : ShuffledMask;
-  };
-
-  const DataLayout &DL = Instr->getDataLayout();
-  // Vectorize the interleaved load group.
-  if (isa<LoadInst>(Instr)) {
-    Value *MaskForGaps = nullptr;
-    if (needsMaskForGaps()) {
-      MaskForGaps =
-          createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
-      assert(MaskForGaps && "Mask for Gaps is required but it is null");
-    }
-
-    Instruction *NewLoad;
-    if (BlockInMask || MaskForGaps) {
-      Value *GroupMask = CreateGroupMask(MaskForGaps);
-      Value *PoisonVec = PoisonValue::get(VecTy);
-      NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
-                                               Group->getAlign(), GroupMask,
-                                               PoisonVec, "wide.masked.vec");
-    } else
-      NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
-                                                Group->getAlign(), "wide.vec");
-    applyMetadata(*NewLoad);
-    // TODO: Also manage existing metadata using VPIRMetadata.
-    Group->addMetadata(NewLoad);
-
-    ArrayRef<VPValue *> VPDefs = definedValues();
-    if (VecTy->isScalableTy()) {
-      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
-      // so must use intrinsics to deinterleave.
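-      // E.g. for factor 2 this emits (illustrative):
-      //   %sv = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
-      //       @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)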
-      assert(InterleaveFactor <= 8 &&
-             "Unsupported deinterleave factor for scalable vectors");
-      NewLoad = State.Builder.CreateIntrinsic(
-          Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
-          NewLoad->getType(), NewLoad,
-          /*FMFSource=*/nullptr, "strided.vec");
-    }
-
-    auto CreateStridedVector = [&InterleaveFactor, &State,
-                                &NewLoad](unsigned Index) -> Value * {
-      assert(Index < InterleaveFactor && "Illegal group index");
-      if (State.VF.isScalable())
-        return State.Builder.CreateExtractValue(NewLoad, Index);
-
-      // For fixed length VF, use shuffle to extract the sub-vectors from the
-      // wide load.
-      auto StrideMask =
-          createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
-      return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
-                                               "strided.vec");
-    };
-
-    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
-      Instruction *Member = Group->getMember(I);
-
-      // Skip the gaps in the group.
-      if (!Member)
-        continue;
-
-      Value *StridedVec = CreateStridedVector(I);
-
-      // If this member has different type, cast the result type.
-      if (Member->getType() != ScalarTy) {
-        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
-        StridedVec =
-            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
-      }
-
-      if (Group->isReverse())
-        StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
-
-      State.set(VPDefs[J], StridedVec);
-      ++J;
-    }
-    return;
-  }
-
-  // The sub vector type for current instruction.
-  auto *SubVT = VectorType::get(ScalarTy, State.VF);
-
-  // Vectorize the interleaved store group.
-  Value *MaskForGaps =
-      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
-  assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
-         "Mismatch between NeedsMaskForGaps and MaskForGaps");
-  ArrayRef<VPValue *> StoredValues = getStoredValues();
-  // Collect the stored vector from each member.
-  SmallVector<Value *> StoredVecs;
-  unsigned StoredIdx = 0;
-  for (unsigned i = 0; i < InterleaveFactor; i++) {
-    assert((Group->getMember(i) || MaskForGaps) &&
-           "Fail to get a member from an interleaved store group");
-    Instruction *Member = Group->getMember(i);
-
-    // Skip the gaps in the group.
-    if (!Member) {
-      Value *Undef = PoisonValue::get(SubVT);
-      StoredVecs.push_back(Undef);
-      continue;
-    }
-
-    Value *StoredVec = State.get(StoredValues[StoredIdx]);
-    ++StoredIdx;
-
-    if (Group->isReverse())
-      StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
-
-    // If this member has different type, cast it to a unified type.
-    if (StoredVec->getType() != SubVT)
-      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
-
-    StoredVecs.push_back(StoredVec);
-  }
-
-  // Interleave all the smaller vectors into one wider vector.
-  Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
-  Instruction *NewStoreInstr;
-  if (BlockInMask || MaskForGaps) {
-    Value *GroupMask = CreateGroupMask(MaskForGaps);
-    NewStoreInstr = State.Builder.CreateMaskedStore(
-        IVec, ResAddr, Group->getAlign(), GroupMask);
-  } else
-    NewStoreInstr =
-        State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
-
-  applyMetadata(*NewStoreInstr);
-  // TODO: Also manage existing metadata using VPIRMetadata.
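-  // E.g. if all members of the group carry common !tbaa metadata, the single
-  // wide store keeps it (illustrative):
-  //   store <12 x i32> %interleaved.vec, ptr %addr, align 4, !tbaa !5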
-  Group->addMetadata(NewStoreInstr);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                     VPSlotTracker &SlotTracker) const {
-  const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
-  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
-  IG->getInsertPos()->printAsOperand(O, false);
-  O << ", ";
-  getAddr()->printAsOperand(O, SlotTracker);
-  VPValue *Mask = getMask();
-  if (Mask) {
-    O << ", ";
-    Mask->printAsOperand(O, SlotTracker);
-  }
-
-  unsigned OpIdx = 0;
-  for (unsigned i = 0; i < IG->getFactor(); ++i) {
-    if (!IG->getMember(i))
-      continue;
-    if (getNumStoreOperands() > 0) {
-      O << "\n" << Indent << "  store ";
-      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
-      O << " to index " << i;
-    } else {
-      O << "\n" << Indent << "  ";
-      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
-      O << " = load from index " << i;
-    }
-    ++OpIdx;
-  }
-}
-#endif
-
-void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
-  assert(!State.Lane && "Interleave group being replicated.");
-  assert(State.VF.isScalable() &&
-         "Only support scalable VF for EVL tail-folding.");
-  assert(!needsMaskForGaps() &&
-         "Masking gaps for scalable vectors is not yet supported.");
-  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
-  Instruction *Instr = Group->getInsertPos();
-
-  // Prepare for the vector type of the interleaved load/store.
-  Type *ScalarTy = getLoadStoreType(Instr);
-  unsigned InterleaveFactor = Group->getFactor();
-  assert(InterleaveFactor <= 8 &&
-         "Unsupported deinterleave/interleave factor for scalable vectors");
-  ElementCount WideVF = State.VF * InterleaveFactor;
-  auto *VecTy = VectorType::get(ScalarTy, WideVF);
-
-  VPValue *Addr = getAddr();
-  Value *ResAddr = State.get(Addr, VPLane(0));
-  Value *EVL = State.get(getEVL(), VPLane(0));
-  Value *InterleaveEVL = State.Builder.CreateMul(
-      EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
-      /*NUW=*/true, /*NSW=*/true);
-  LLVMContext &Ctx = State.Builder.getContext();
-
-  Value *GroupMask = nullptr;
-  if (VPValue *BlockInMask = getMask()) {
-    SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
-    GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
-  } else {
-    GroupMask =
-        State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
-  }
-
-  // Vectorize the interleaved load group.
-  if (isa<LoadInst>(Instr)) {
-    CallInst *NewLoad = State.Builder.CreateIntrinsic(
-        VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
-        "wide.vp.load");
-    NewLoad->addParamAttr(0,
-                          Attribute::getWithAlignment(Ctx, Group->getAlign()));
-
-    applyMetadata(*NewLoad);
-    // TODO: Also manage existing metadata using VPIRMetadata.
-    Group->addMetadata(NewLoad);
-
-    // Scalable vectors cannot use arbitrary shufflevectors (only splats),
-    // so must use intrinsics to deinterleave.
-    NewLoad = State.Builder.CreateIntrinsic(
-        Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
-        NewLoad->getType(), NewLoad,
-        /*FMFSource=*/nullptr, "strided.vec");
-
-    const DataLayout &DL = Instr->getDataLayout();
-    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
-      Instruction *Member = Group->getMember(I);
-      // Skip the gaps in the group.
-      if (!Member)
-        continue;
-
-      Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
-      // If this member has different type, cast the result type.
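-      // E.g. a group mixing i64 and double members extracts the sub-vector as
-      // <vscale x 4 x i64> and bitcasts it to <vscale x 4 x double> here
-      // (illustrative).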
-      if (Member->getType() != ScalarTy) {
-        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
-        StridedVec =
-            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
-      }
-
-      State.set(getVPValue(J), StridedVec);
-      ++J;
-    }
-    return;
-  } // End of the interleaved load group.
-
-  // The sub vector type for current instruction.
-  auto *SubVT = VectorType::get(ScalarTy, State.VF);
-  // Vectorize the interleaved store group.
-  ArrayRef<VPValue *> StoredValues = getStoredValues();
-  // Collect the stored vector from each member.
-  SmallVector<Value *> StoredVecs;
-  const DataLayout &DL = Instr->getDataLayout();
-  for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
-    Instruction *Member = Group->getMember(I);
-    // Skip the gaps in the group.
-    if (!Member) {
-      StoredVecs.push_back(PoisonValue::get(SubVT));
-      continue;
-    }
-
-    Value *StoredVec = State.get(StoredValues[StoredIdx]);
-    // If this member has different type, cast it to a unified type.
-    if (StoredVec->getType() != SubVT)
-      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
-
-    StoredVecs.push_back(StoredVec);
-    ++StoredIdx;
-  }
-
-  // Interleave all the smaller vectors into one wider vector.
-  Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
-  CallInst *NewStore =
-      State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
-                                    {IVec, ResAddr, GroupMask, InterleaveEVL});
-  NewStore->addParamAttr(1,
-                         Attribute::getWithAlignment(Ctx, Group->getAlign()));
-
-  applyMetadata(*NewStore);
-  // TODO: Also manage existing metadata using VPIRMetadata.
-  Group->addMetadata(NewStore);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                        VPSlotTracker &SlotTracker) const {
-  const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
-  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
-  IG->getInsertPos()->printAsOperand(O, false);
-  O << ", ";
-  getAddr()->printAsOperand(O, SlotTracker);
-  O << ", ";
-  getEVL()->printAsOperand(O, SlotTracker);
-  if (VPValue *Mask = getMask()) {
-    O << ", ";
-    Mask->printAsOperand(O, SlotTracker);
-  }
-
-  unsigned OpIdx = 0;
-  for (unsigned i = 0; i < IG->getFactor(); ++i) {
-    if (!IG->getMember(i))
-      continue;
-    if (getNumStoreOperands() > 0) {
-      O << "\n" << Indent << "  vp.store ";
-      getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
-      O << " to index " << i;
-    } else {
-      O << "\n" << Indent << "  ";
-      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
-      O << " = vp.load from index " << i;
-    }
-    ++OpIdx;
-  }
-}
-#endif
-
-InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
-                                              VPCostContext &Ctx) const {
-  Instruction *InsertPos = getInsertPos();
-  // Find the VPValue index of the interleave group. We need to skip gaps.
-  unsigned InsertPosIdx = 0;
-  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
-    if (auto *Member = IG->getMember(Idx)) {
-      if (Member == InsertPos)
-        break;
-      InsertPosIdx++;
-    }
-  Type *ValTy = Ctx.Types.inferScalarType(
-      getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
-                                : getStoredValues()[InsertPosIdx]);
-  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
-  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
-                    ->getAddressSpace();
-
-  unsigned InterleaveFactor = IG->getFactor();
-  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
-
-  // Holds the indices of existing members in the interleaved group.
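-  // E.g. a factor-3 group with a gap at member 1 yields Indices = {0, 2}.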
-  SmallVector<unsigned, 4> Indices;
-  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
-    if (IG->getMember(IF))
-      Indices.push_back(IF);
-
-  // Calculate the cost of the whole interleaved group.
-  InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
-      InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
-      IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
-
-  if (!IG->isReverse())
-    return Cost;
-
-  return Cost + IG->getNumMembers() *
-                    Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                           VectorTy, VectorTy, {}, Ctx.CostKind,
-                                           0);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                         VPSlotTracker &SlotTracker) const {
-  O << Indent << "EMIT ";
-  printAsOperand(O, SlotTracker);
-  O << " = CANONICAL-INDUCTION ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
-  return vputils::onlyScalarValuesUsed(this) &&
-         (!IsScalable || vputils::onlyFirstLaneUsed(this));
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenPointerInductionRecipe::printRecipe(
-    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
-  assert((getNumOperands() == 3 || getNumOperands() == 5) &&
-         "unexpected number of operands");
-  O << Indent << "EMIT ";
-  printAsOperand(O, SlotTracker);
-  O << " = WIDEN-POINTER-INDUCTION ";
-  getStartValue()->printAsOperand(O, SlotTracker);
-  O << ", ";
-  getStepValue()->printAsOperand(O, SlotTracker);
-  O << ", ";
-  getOperand(2)->printAsOperand(O, SlotTracker);
-  if (getNumOperands() == 5) {
-    O << ", ";
-    getOperand(3)->printAsOperand(O, SlotTracker);
-    O << ", ";
-    getOperand(4)->printAsOperand(O, SlotTracker);
-  }
-}
-
-void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                     VPSlotTracker &SlotTracker) const {
-  O << Indent << "EMIT ";
-  printAsOperand(O, SlotTracker);
-  O << " = EXPAND SCEV " << *Expr;
-}
-#endif
-
-void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
-  Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
-  Type *STy = CanonicalIV->getType();
-  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
-  ElementCount VF = State.VF;
-  Value *VStart = VF.isScalar()
-                      ? CanonicalIV
-                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
-  Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
-  if (VF.isVector()) {
-    VStep = Builder.CreateVectorSplat(VF, VStep);
-    VStep =
-        Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
-  }
-  Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
-  State.set(this, CanonicalVectorIV);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                           VPSlotTracker &SlotTracker) const {
-  O << Indent << "EMIT ";
-  printAsOperand(O, SlotTracker);
-  O << " = WIDEN-CANONICAL-INDUCTION ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
-  auto &Builder = State.Builder;
-  // Create a vector from the initial value.
-  auto *VectorInit = getStartValue()->getLiveInIRValue();
-
-  Type *VecTy = State.VF.isScalar()
-                    ? VectorInit->getType()
-                    : VectorType::get(VectorInit->getType(), State.VF);
-
-  BasicBlock *VectorPH =
-      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
-  if (State.VF.isVector()) {
-    auto *IdxTy = Builder.getInt32Ty();
-    auto *One = ConstantInt::get(IdxTy, 1);
-    IRBuilder<>::InsertPointGuard Guard(Builder);
-    Builder.SetInsertPoint(VectorPH->getTerminator());
-    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
-    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
-    VectorInit = Builder.CreateInsertElement(
-        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
-  }
-
-  // Create a phi node for the new recurrence.
-  PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
-  Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
-  Phi->addIncoming(VectorInit, VectorPH);
-  State.set(this, Phi);
-}
-
-InstructionCost
-VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
-                                             VPCostContext &Ctx) const {
-  if (VF.isScalar())
-    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
-
-  return 0;
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPFirstOrderRecurrencePHIRecipe::printRecipe(
-    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
-  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
-  printAsOperand(O, SlotTracker);
-  O << " = phi ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-void VPReductionPHIRecipe::execute(VPTransformState &State) {
-  // Reductions do not have to start at zero. They can start with
-  // any loop invariant values.
-  VPValue *StartVPV = getStartValue();
-
-  // In order to support recurrences we need to be able to vectorize Phi nodes.
-  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
-  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
-  // this value when we vectorize all of the instructions that use the PHI.
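-  // E.g. for a widened i32 add reduction with VF=4, the phi created here is
-  // roughly (illustrative):
-  //   %vec.phi = phi <4 x i32> [ %start, %vector.ph ], [ %next, %vector.body ]
-  // with the back-edge value only filled in during stage #2 (fixup).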
- BasicBlock *VectorPH = - State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); - bool ScalarPHI = State.VF.isScalar() || isInLoop(); - Value *StartV = State.get(StartVPV, ScalarPHI); - Type *VecTy = StartV->getType(); - - BasicBlock *HeaderBB = State.CFG.PrevBB; - assert(State.CurrentParentLoop->getHeader() == HeaderBB && - "recipe must be in the vector loop header"); - auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); - Phi->insertBefore(HeaderBB->getFirstInsertionPt()); - State.set(this, Phi, isInLoop()); - - Phi->addIncoming(StartV, VectorPH); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-REDUCTION-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); - if (getVFScaleFactor() > 1) - O << " (VF scaled by 1/" << getVFScaleFactor() << ")"; -} -#endif - -void VPWidenPHIRecipe::execute(VPTransformState &State) { - Value *Op0 = State.get(getOperand(0)); - Type *VecTy = Op0->getType(); - Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name); - State.set(this, VecPhi); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printPhiOperands(O, SlotTracker); -} -#endif - -void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { - BasicBlock *VectorPH = - State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); - Value *StartMask = State.get(getOperand(0)); - PHINode *Phi = - State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); - Phi->addIncoming(StartMask, VectorPH); - State.set(this, Phi); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "ACTIVE-LANE-MASK-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} -#endif - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPEVLBasedIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} -#endif