diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 3497ff7856eed..28648315df201 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1470,6 +1470,8 @@ QualType Sema::CheckNonTypeTemplateParameterType(QualType T,
       T->isLValueReferenceType() ||
       //   -- pointer to member,
       T->isMemberPointerType() ||
+      //   -- block pointer,
+      T->isBlockPointerType() ||
       //   -- std::nullptr_t, or
       T->isNullPtrType() ||
       // -- a type that contains a placeholder type.
@@ -7342,7 +7344,7 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType,
     //   For a non-type template-parameter of pointer or reference type,
     //   the value of the constant expression shall not refer to
     assert(ParamType->isPointerOrReferenceType() ||
-           ParamType->isNullPtrType());
+           ParamType->isNullPtrType() || ParamType->isBlockPointerType());
     // -- a temporary object
     // -- a string literal
     // -- the result of a typeid expression, or
diff --git a/clang/test/SemaCXX/gh189247.cpp b/clang/test/SemaCXX/gh189247.cpp
new file mode 100644
index 0000000000000..79f7b312aee83
--- /dev/null
+++ b/clang/test/SemaCXX/gh189247.cpp
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -fblocks -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+template <void (^)()> struct T;
+T<nullptr> *t;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
deleted file mode 100644
index 6c7bedaf2c933..0000000000000
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ /dev/null
@@ -1,4435 +0,0 @@
-//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains implementations for different VPlan recipes.
-///
-//===----------------------------------------------------------------------===//
-
-#include "LoopVectorizationPlanner.h"
-#include "VPlan.h"
-#include "VPlanAnalysis.h"
-#include "VPlanHelpers.h"
-#include "VPlanPatternMatch.h"
-#include "VPlanUtils.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/IVDescriptors.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include <cassert>
-
-using namespace llvm;
-using namespace llvm::VPlanPatternMatch;
-
-using VectorParts = SmallVector<Value *, 2>;
-
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-
-bool VPRecipeBase::mayWriteToMemory() const {
-  switch (getVPDefID()) {
-  case VPExpressionSC:
-    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
-  case VPInstructionSC: {
-    auto *VPI = cast<VPInstruction>(this);
-    // Loads read from memory but don't write to memory.
-    if (VPI->getOpcode() == Instruction::Load)
-      return false;
-    return VPI->opcodeMayReadOrWriteFromMemory();
-  }
-  case VPInterleaveEVLSC:
-  case VPInterleaveSC:
-    return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
-  case VPWidenStoreEVLSC:
-  case VPWidenStoreSC:
-    return true;
-  case VPReplicateSC:
-    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
-        ->mayWriteToMemory();
-  case VPWidenCallSC:
-    return !cast<VPWidenCallRecipe>(this)
-                ->getCalledScalarFunction()
-                ->onlyReadsMemory();
-  case VPWidenIntrinsicSC:
-    return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
-  case VPCanonicalIVPHISC:
-  case VPBranchOnMaskSC:
-  case VPDerivedIVSC:
-  case VPFirstOrderRecurrencePHISC:
-  case VPReductionPHISC:
-  case VPScalarIVStepsSC:
-  case VPPredInstPHISC:
-    return false;
-  case VPBlendSC:
-  case VPReductionEVLSC:
-  case VPReductionSC:
-  case VPVectorPointerSC:
-  case VPWidenCanonicalIVSC:
-  case VPWidenCastSC:
-  case VPWidenGEPSC:
-  case VPWidenIntOrFpInductionSC:
-  case VPWidenLoadEVLSC:
-  case VPWidenLoadSC:
-  case VPWidenPHISC:
-  case VPWidenPointerInductionSC:
-  case VPWidenSC:
-  case VPWidenSelectSC: {
-    const Instruction *I =
-        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
-    (void)I;
-    assert((!I || !I->mayWriteToMemory()) &&
-           "underlying instruction may write to memory");
-    return false;
-  }
-  default:
-    return true;
-  }
-}
-
-bool VPRecipeBase::mayReadFromMemory() const {
-  switch (getVPDefID()) {
-  case VPExpressionSC:
-    return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
-  case VPInstructionSC:
-    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
-  case VPWidenLoadEVLSC:
-  case VPWidenLoadSC:
-    return true;
-  case VPReplicateSC:
-    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
-        ->mayReadFromMemory();
-  case VPWidenCallSC:
-    return !cast<VPWidenCallRecipe>(this)
-                ->getCalledScalarFunction()
-                ->onlyWritesMemory();
-  case VPWidenIntrinsicSC:
-    return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
-  case VPBranchOnMaskSC:
-  case VPDerivedIVSC:
-  case VPFirstOrderRecurrencePHISC:
-  case VPPredInstPHISC:
-  case VPScalarIVStepsSC:
-  case VPWidenStoreEVLSC:
-  case VPWidenStoreSC:
-    return false;
-  case VPBlendSC:
-  case VPReductionEVLSC:
-  case VPReductionSC:
-  case VPVectorPointerSC:
-  case VPWidenCanonicalIVSC:
-  case VPWidenCastSC:
-  case VPWidenGEPSC:
-  case VPWidenIntOrFpInductionSC:
-  case VPWidenPHISC:
-  case VPWidenPointerInductionSC:
-  case VPWidenSC:
-  case VPWidenSelectSC: {
-    const Instruction *I =
-        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
-    (void)I;
-    assert((!I || !I->mayReadFromMemory()) &&
-           "underlying instruction may read from memory");
-    return false;
-  }
-  default:
-    // FIXME: Return false if the recipe represents an interleaved store.
-    return true;
-  }
-}
-
-bool VPRecipeBase::mayHaveSideEffects() const {
-  switch (getVPDefID()) {
-  case VPExpressionSC:
-    return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
-  case VPDerivedIVSC:
-  case VPFirstOrderRecurrencePHISC:
-  case VPPredInstPHISC:
-  case VPVectorEndPointerSC:
-    return false;
-  case VPInstructionSC: {
-    auto *VPI = cast<VPInstruction>(this);
-    return mayWriteToMemory() ||
-           VPI->getOpcode() == VPInstruction::BranchOnCount ||
-           VPI->getOpcode() == VPInstruction::BranchOnCond;
-  }
-  case VPWidenCallSC: {
-    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
-    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
-  }
-  case VPWidenIntrinsicSC:
-    return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
-  case VPBlendSC:
-  case VPReductionEVLSC:
-  case VPReductionSC:
-  case VPScalarIVStepsSC:
-  case VPVectorPointerSC:
-  case VPWidenCanonicalIVSC:
-  case VPWidenCastSC:
-  case VPWidenGEPSC:
-  case VPWidenIntOrFpInductionSC:
-  case VPWidenPHISC:
-  case VPWidenPointerInductionSC:
-  case VPWidenSC:
-  case VPWidenSelectSC: {
-    const Instruction *I =
-        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
-    (void)I;
-    assert((!I || !I->mayHaveSideEffects()) &&
-           "underlying instruction has side-effects");
-    return false;
-  }
-  case VPInterleaveEVLSC:
-  case VPInterleaveSC:
-    return mayWriteToMemory();
-  case VPWidenLoadEVLSC:
-  case VPWidenLoadSC:
-  case VPWidenStoreEVLSC:
-  case VPWidenStoreSC:
-    assert(
-        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
-            mayWriteToMemory() &&
-        "mayHaveSideEffects result for ingredient differs from this "
-        "implementation");
-    return mayWriteToMemory();
-  case VPReplicateSC: {
-    auto *R = cast<VPReplicateRecipe>(this);
-    return R->getUnderlyingInstr()->mayHaveSideEffects();
-  }
-  default:
-    return true;
-  }
-}
-
-void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
-  assert(!Parent && "Recipe already in some VPBasicBlock");
-  assert(InsertPos->getParent() &&
-         "Insertion position not in any VPBasicBlock");
-  InsertPos->getParent()->insert(this, InsertPos->getIterator());
-}
-
-void VPRecipeBase::insertBefore(VPBasicBlock &BB,
-                                iplist<VPRecipeBase>::iterator I) {
-  assert(!Parent && "Recipe already in some VPBasicBlock");
-  assert(I == BB.end() || I->getParent() == &BB);
-  BB.insert(this, I);
-}
-
-void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
-  assert(!Parent && "Recipe already in some VPBasicBlock");
-  assert(InsertPos->getParent() &&
-         "Insertion position not in any VPBasicBlock");
-  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
-}
-
-void VPRecipeBase::removeFromParent() {
-  assert(getParent() && "Recipe not in any VPBasicBlock");
-  getParent()->getRecipeList().remove(getIterator());
-  Parent = nullptr;
-}
-
-iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
-  assert(getParent() && "Recipe not in any VPBasicBlock");
-  return getParent()->getRecipeList().erase(getIterator());
-}
-
-void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
-  removeFromParent();
-  insertAfter(InsertPos);
-}
-
-void VPRecipeBase::moveBefore(VPBasicBlock &BB,
-                              iplist<VPRecipeBase>::iterator I) {
-  removeFromParent();
-  insertBefore(BB, I);
-}
-
-InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
-  // Get the underlying instruction for the recipe, if there is one. It is used
-  // to
-  //   * decide if cost computation should be skipped for this recipe,
-  //   * apply forced target instruction cost.
- Instruction *UI = nullptr; - if (auto *S = dyn_cast(this)) - UI = dyn_cast_or_null(S->getUnderlyingValue()); - else if (auto *IG = dyn_cast(this)) - UI = IG->getInsertPos(); - else if (auto *WidenMem = dyn_cast(this)) - UI = &WidenMem->getIngredient(); - - InstructionCost RecipeCost; - if (UI && Ctx.skipCostComputation(UI, VF.isVector())) { - RecipeCost = 0; - } else { - RecipeCost = computeCost(VF, Ctx); - if (ForceTargetInstructionCost.getNumOccurrences() > 0 && - RecipeCost.isValid()) { - if (UI) - RecipeCost = InstructionCost(ForceTargetInstructionCost); - else - RecipeCost = InstructionCost(0); - } - } - - LLVM_DEBUG({ - dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; - dump(); - }); - return RecipeCost; -} - -InstructionCost VPRecipeBase::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - llvm_unreachable("subclasses should implement computeCost"); -} - -bool VPRecipeBase::isPhi() const { - return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) || - isa(this); -} - -bool VPRecipeBase::isScalarCast() const { - auto *VPI = dyn_cast(this); - return VPI && Instruction::isCast(VPI->getOpcode()); -} - -void VPIRFlags::intersectFlags(const VPIRFlags &Other) { - assert(OpType == Other.OpType && "OpType must match"); - switch (OpType) { - case OperationType::OverflowingBinOp: - WrapFlags.HasNUW &= Other.WrapFlags.HasNUW; - WrapFlags.HasNSW &= Other.WrapFlags.HasNSW; - break; - case OperationType::Trunc: - TruncFlags.HasNUW &= Other.TruncFlags.HasNUW; - TruncFlags.HasNSW &= Other.TruncFlags.HasNSW; - break; - case OperationType::DisjointOp: - DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint; - break; - case OperationType::PossiblyExactOp: - ExactFlags.IsExact &= Other.ExactFlags.IsExact; - break; - case OperationType::GEPOp: - GEPFlags &= Other.GEPFlags; - break; - case OperationType::FPMathOp: - case OperationType::FCmp: - assert((OpType != OperationType::FCmp || - FCmpFlags.Pred == Other.FCmpFlags.Pred) && - "Cannot drop CmpPredicate"); - getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs; - getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs; - break; - case OperationType::NonNegOp: - NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg; - break; - case OperationType::Cmp: - assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate"); - break; - case OperationType::Other: - assert(AllFlags == Other.AllFlags && "Cannot drop other flags"); - break; - } -} - -FastMathFlags VPIRFlags::getFastMathFlags() const { - assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp) && - "recipe doesn't have fast math flags"); - const FastMathFlagsTy &F = getFMFsRef(); - FastMathFlags Res; - Res.setAllowReassoc(F.AllowReassoc); - Res.setNoNaNs(F.NoNaNs); - Res.setNoInfs(F.NoInfs); - Res.setNoSignedZeros(F.NoSignedZeros); - Res.setAllowReciprocal(F.AllowReciprocal); - Res.setAllowContract(F.AllowContract); - Res.setApproxFunc(F.ApproxFunc); - return Res; -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPSingleDefRecipe::dump() const { VPDef::dump(); } - -void VPRecipeBase::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - printRecipe(O, Indent, SlotTracker); - if (auto DL = getDebugLoc()) { - O << ", !dbg "; - DL.print(O); - } - - if (auto *Metadata = dyn_cast(this)) - Metadata->print(O, SlotTracker); -} -#endif - -template -VPValue * -VPUnrollPartAccessor::getUnrollPartOperand(const VPUser &U) const { - if (U.getNumOperands() == PartOpIdx + 1) - return U.getOperand(PartOpIdx); - return 
nullptr; -} - -template -unsigned VPUnrollPartAccessor::getUnrollPart(const VPUser &U) const { - if (auto *UnrollPartOp = getUnrollPartOperand(U)) - return cast(UnrollPartOp->getLiveInIRValue())->getZExtValue(); - return 0; -} - -namespace llvm { -template class VPUnrollPartAccessor<1>; -template class VPUnrollPartAccessor<2>; -template class VPUnrollPartAccessor<3>; -} - -VPInstruction::VPInstruction(unsigned Opcode, ArrayRef Operands, - const VPIRFlags &Flags, const VPIRMetadata &MD, - DebugLoc DL, const Twine &Name) - : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), - VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) { - assert(flagsValidForOpcode(getOpcode()) && - "Set flags not supported for the provided opcode"); - assert((getNumOperandsForOpcode(Opcode) == -1u || - getNumOperandsForOpcode(Opcode) == getNumOperands()) && - "number of operands does not match opcode"); -} - -#ifndef NDEBUG -unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { - if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode)) - return 1; - - if (Instruction::isBinaryOp(Opcode)) - return 2; - - switch (Opcode) { - case VPInstruction::StepVector: - case VPInstruction::VScale: - return 0; - case Instruction::Alloca: - case Instruction::ExtractValue: - case Instruction::Freeze: - case Instruction::Load: - case VPInstruction::BranchOnCond: - case VPInstruction::Broadcast: - case VPInstruction::BuildStructVector: - case VPInstruction::BuildVector: - case VPInstruction::CalculateTripCountMinusVF: - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::ExplicitVectorLength: - case VPInstruction::ExtractLastLane: - case VPInstruction::ExtractLastPart: - case VPInstruction::ExtractPenultimateElement: - case VPInstruction::Not: - case VPInstruction::ResumeForEpilogue: - case VPInstruction::Unpack: - return 1; - case Instruction::ICmp: - case Instruction::FCmp: - case Instruction::ExtractElement: - case Instruction::Store: - case VPInstruction::BranchOnCount: - case VPInstruction::ComputeReductionResult: - case VPInstruction::ExtractLane: - case VPInstruction::FirstOrderRecurrenceSplice: - case VPInstruction::LogicalAnd: - case VPInstruction::PtrAdd: - case VPInstruction::WidePtrAdd: - case VPInstruction::WideIVStep: - return 2; - case Instruction::Select: - case VPInstruction::ActiveLaneMask: - case VPInstruction::ComputeAnyOfResult: - case VPInstruction::ReductionStartVector: - return 3; - case VPInstruction::ComputeFindIVResult: - return 4; - case Instruction::Call: - case Instruction::GetElementPtr: - case Instruction::PHI: - case Instruction::Switch: - case VPInstruction::AnyOf: - case VPInstruction::FirstActiveLane: - case VPInstruction::LastActiveLane: - case VPInstruction::SLPLoad: - case VPInstruction::SLPStore: - // Cannot determine the number of operands from the opcode. 
- return -1u; - } - llvm_unreachable("all cases should be handled above"); -} -#endif - -bool VPInstruction::doesGeneratePerAllLanes() const { - return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this); -} - -bool VPInstruction::canGenerateScalarForFirstLane() const { - if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) - return true; - if (isSingleScalar() || isVectorToScalar()) - return true; - switch (Opcode) { - case Instruction::Freeze: - case Instruction::ICmp: - case Instruction::PHI: - case Instruction::Select: - case VPInstruction::BranchOnCond: - case VPInstruction::BranchOnCount: - case VPInstruction::CalculateTripCountMinusVF: - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::PtrAdd: - case VPInstruction::ExplicitVectorLength: - case VPInstruction::AnyOf: - case VPInstruction::Not: - return true; - default: - return false; - } -} - -/// Create a conditional branch using \p Cond branching to the successors of \p -/// VPBB. Note that the first successor is always forward (i.e. not created yet) -/// while the second successor may already have been created (if it is a header -/// block and VPBB is a latch). -static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB, - VPTransformState &State) { - // Replace the temporary unreachable terminator with a new conditional - // branch, hooking it up to backward destination (header) for latch blocks - // now, and to forward destination(s) later when they are created. - // Second successor may be backwards - iff it is already in VPBB2IRBB. - VPBasicBlock *SecondVPSucc = cast(VPBB->getSuccessors()[1]); - BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc); - BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB]; - BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc); - // First successor is always forward, reset it to nullptr - CondBr->setSuccessor(0, nullptr); - IRBB->getTerminator()->eraseFromParent(); - return CondBr; -} - -Value *VPInstruction::generate(VPTransformState &State) { - IRBuilderBase &Builder = State.Builder; - - if (Instruction::isBinaryOp(getOpcode())) { - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); - Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); - auto *Res = - Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); - if (auto *I = dyn_cast(Res)) - applyFlags(*I); - return Res; - } - - switch (getOpcode()) { - case VPInstruction::Not: { - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); - return Builder.CreateNot(A, Name); - } - case Instruction::ExtractElement: { - assert(State.VF.isVector() && "Only extract elements from vectors"); - if (getOperand(1)->isLiveIn()) { - unsigned IdxToExtract = - cast(getOperand(1)->getLiveInIRValue())->getZExtValue(); - return State.get(getOperand(0), VPLane(IdxToExtract)); - } - Value *Vec = State.get(getOperand(0)); - Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); - return Builder.CreateExtractElement(Vec, Idx, Name); - } - case Instruction::Freeze: { - Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); - return Builder.CreateFreeze(Op, Name); - } - case Instruction::FCmp: - case Instruction::ICmp: { - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); - Value *B = State.get(getOperand(1), OnlyFirstLaneUsed); - return 
Builder.CreateCmp(getPredicate(), A, B, Name); - } - case Instruction::PHI: { - llvm_unreachable("should be handled by VPPhi::execute"); - } - case Instruction::Select: { - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *Cond = - State.get(getOperand(0), - OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0))); - Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); - Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); - return Builder.CreateSelect(Cond, Op1, Op2, Name); - } - case VPInstruction::ActiveLaneMask: { - // Get first lane of vector induction variable. - Value *VIVElem0 = State.get(getOperand(0), VPLane(0)); - // Get the original loop tripcount. - Value *ScalarTC = State.get(getOperand(1), VPLane(0)); - - // If this part of the active lane mask is scalar, generate the CMP directly - // to avoid unnecessary extracts. - if (State.VF.isScalar()) - return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, - Name); - - ElementCount EC = State.VF.multiplyCoefficientBy( - cast(getOperand(2)->getLiveInIRValue())->getZExtValue()); - auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC); - return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, - {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, Name); - } - case VPInstruction::FirstOrderRecurrenceSplice: { - // Generate code to combine the previous and current values in vector v3. - // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) - - auto *V1 = State.get(getOperand(0)); - if (!V1->getType()->isVectorTy()) - return V1; - Value *V2 = State.get(getOperand(1)); - return Builder.CreateVectorSplice(V1, V2, -1, Name); - } - case VPInstruction::CalculateTripCountMinusVF: { - unsigned UF = getParent()->getPlan()->getUF(); - Value *ScalarTC = State.get(getOperand(0), VPLane(0)); - Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF); - Value *Sub = Builder.CreateSub(ScalarTC, Step); - Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step); - Value *Zero = ConstantInt::getNullValue(ScalarTC->getType()); - return Builder.CreateSelect(Cmp, Sub, Zero); - } - case VPInstruction::ExplicitVectorLength: { - // TODO: Restructure this code with an explicit remainder loop, vsetvli can - // be outside of the main loop. - Value *AVL = State.get(getOperand(0), /*IsScalar*/ true); - // Compute EVL - assert(AVL->getType()->isIntegerTy() && - "Requested vector length should be an integer."); - - assert(State.VF.isScalable() && "Expected scalable vector factor."); - Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue()); - - Value *EVL = Builder.CreateIntrinsic( - Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, - {AVL, VFArg, Builder.getTrue()}); - return EVL; - } - case VPInstruction::CanonicalIVIncrementForPart: { - unsigned Part = getUnrollPart(*this); - auto *IV = State.get(getOperand(0), VPLane(0)); - assert(Part != 0 && "Must have a positive part"); - // The canonical IV is incremented by the vectorization factor (num of - // SIMD elements) times the unroll part. 
- Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); - return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(), - hasNoSignedWrap()); - } - case VPInstruction::BranchOnCond: { - Value *Cond = State.get(getOperand(0), VPLane(0)); - auto *Br = createCondBranch(Cond, getParent(), State); - applyMetadata(*Br); - return Br; - } - case VPInstruction::BranchOnCount: { - // First create the compare. - Value *IV = State.get(getOperand(0), /*IsScalar*/ true); - Value *TC = State.get(getOperand(1), /*IsScalar*/ true); - Value *Cond = Builder.CreateICmpEQ(IV, TC); - return createCondBranch(Cond, getParent(), State); - } - case VPInstruction::Broadcast: { - return Builder.CreateVectorSplat( - State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast"); - } - case VPInstruction::BuildStructVector: { - // For struct types, we need to build a new 'wide' struct type, where each - // element is widened, i.e., we create a struct of vectors. - auto *StructTy = - cast(State.TypeAnalysis.inferScalarType(getOperand(0))); - Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF)); - for (const auto &[LaneIndex, Op] : enumerate(operands())) { - for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements(); - FieldIndex++) { - Value *ScalarValue = - Builder.CreateExtractValue(State.get(Op, true), FieldIndex); - Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex); - VectorValue = - Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex); - Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex); - } - } - return Res; - } - case VPInstruction::BuildVector: { - auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0)); - auto NumOfElements = ElementCount::getFixed(getNumOperands()); - Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements)); - for (const auto &[Idx, Op] : enumerate(operands())) - Res = Builder.CreateInsertElement(Res, State.get(Op, true), - Builder.getInt32(Idx)); - return Res; - } - case VPInstruction::ReductionStartVector: { - if (State.VF.isScalar()) - return State.get(getOperand(0), true); - IRBuilderBase::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(getFastMathFlags()); - // If this start vector is scaled then it should produce a vector with fewer - // elements than the VF. - ElementCount VF = State.VF.divideCoefficientBy( - cast(getOperand(2)->getLiveInIRValue())->getZExtValue()); - auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true)); - return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true), - Builder.getInt32(0)); - } - case VPInstruction::ComputeAnyOfResult: { - // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary - // and will be removed by breaking up the recipe further. - auto *PhiR = cast(getOperand(0)); - auto *OrigPhi = cast(PhiR->getUnderlyingValue()); - Value *ReducedPartRdx = State.get(getOperand(2)); - for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx) - ReducedPartRdx = - Builder.CreateBinOp(Instruction::Or, State.get(getOperand(Idx)), - ReducedPartRdx, "bin.rdx"); - return createAnyOfReduction(Builder, ReducedPartRdx, - State.get(getOperand(1), VPLane(0)), OrigPhi); - } - case VPInstruction::ComputeFindIVResult: { - // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary - // and will be removed by breaking up the recipe further. - auto *PhiR = cast(getOperand(0)); - // Get its reduction variable descriptor. 
- RecurKind RK = PhiR->getRecurrenceKind(); - assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && - "Unexpected reduction kind"); - assert(!PhiR->isInLoop() && - "In-loop FindLastIV reduction is not supported yet"); - - // The recipe's operands are the reduction phi, the start value, the - // sentinel value, followed by one operand for each part of the reduction. - unsigned UF = getNumOperands() - 3; - Value *ReducedPartRdx = State.get(getOperand(3)); - RecurKind MinMaxKind; - bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK); - if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) - MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax; - else - MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin; - for (unsigned Part = 1; Part < UF; ++Part) - ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, - State.get(getOperand(3 + Part))); - - Value *Start = State.get(getOperand(1), true); - Value *Sentinel = getOperand(2)->getLiveInIRValue(); - return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start, - Sentinel); - } - case VPInstruction::ComputeReductionResult: { - // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary - // and will be removed by breaking up the recipe further. - auto *PhiR = cast(getOperand(0)); - // Get its reduction variable descriptor. - - RecurKind RK = PhiR->getRecurrenceKind(); - assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && - "should be handled by ComputeFindIVResult"); - - // The recipe's operands are the reduction phi, followed by one operand for - // each part of the reduction. - unsigned UF = getNumOperands() - 1; - VectorParts RdxParts(UF); - for (unsigned Part = 0; Part < UF; ++Part) - RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop()); - - IRBuilderBase::FastMathFlagGuard FMFG(Builder); - if (hasFastMathFlags()) - Builder.setFastMathFlags(getFastMathFlags()); - - // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = RdxParts[0]; - if (PhiR->isOrdered()) { - ReducedPartRdx = RdxParts[UF - 1]; - } else { - // Floating-point operations should have some FMF to enable the reduction. - for (unsigned Part = 1; Part < UF; ++Part) { - Value *RdxPart = RdxParts[Part]; - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) - ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); - else { - // For sub-recurrences, each UF's reduction variable is already - // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1) - Instruction::BinaryOps Opcode = - RK == RecurKind::Sub - ? Instruction::Add - : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK); - ReducedPartRdx = - Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx"); - } - } - } - - // Create the reduction after the loop. Note that inloop reductions create - // the target reduction in the loop using a Reduction recipe. - if (State.VF.isVector() && !PhiR->isInLoop()) { - // TODO: Support in-order reductions based on the recurrence descriptor. - // All ops in the reduction inherit fast-math-flags from the recurrence - // descriptor. - ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK); - } - - return ReducedPartRdx; - } - case VPInstruction::ExtractLastLane: - case VPInstruction::ExtractPenultimateElement: { - unsigned Offset = - getOpcode() == VPInstruction::ExtractPenultimateElement ? 
2 : 1; - Value *Res; - if (State.VF.isVector()) { - assert(Offset <= State.VF.getKnownMinValue() && - "invalid offset to extract from"); - // Extract lane VF - Offset from the operand. - Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset)); - } else { - // TODO: Remove ExtractLastLane for scalar VFs. - assert(Offset <= 1 && "invalid offset to extract from"); - Res = State.get(getOperand(0)); - } - if (isa(Res)) - Res->setName(Name); - return Res; - } - case VPInstruction::LogicalAnd: { - Value *A = State.get(getOperand(0)); - Value *B = State.get(getOperand(1)); - return Builder.CreateLogicalAnd(A, B, Name); - } - case VPInstruction::PtrAdd: { - assert(vputils::onlyFirstLaneUsed(this) && - "can only generate first lane for PtrAdd"); - Value *Ptr = State.get(getOperand(0), VPLane(0)); - Value *Addend = State.get(getOperand(1), VPLane(0)); - return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); - } - case VPInstruction::WidePtrAdd: { - Value *Ptr = - State.get(getOperand(0), vputils::isSingleScalar(getOperand(0))); - Value *Addend = State.get(getOperand(1)); - return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); - } - case VPInstruction::AnyOf: { - Value *Res = Builder.CreateFreeze(State.get(getOperand(0))); - for (VPValue *Op : drop_begin(operands())) - Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op))); - return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); - } - case VPInstruction::ExtractLane: { - Value *LaneToExtract = State.get(getOperand(0), true); - Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0)); - Value *Res = nullptr; - Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); - - for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) { - Value *VectorStart = - Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1)); - Value *VectorIdx = Idx == 1 - ? LaneToExtract - : Builder.CreateSub(LaneToExtract, VectorStart); - Value *Ext = State.VF.isScalar() - ? State.get(getOperand(Idx)) - : Builder.CreateExtractElement( - State.get(getOperand(Idx)), VectorIdx); - if (Res) { - Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart); - Res = Builder.CreateSelect(Cmp, Ext, Res); - } else { - Res = Ext; - } - } - return Res; - } - case VPInstruction::FirstActiveLane: { - if (getNumOperands() == 1) { - Value *Mask = State.get(getOperand(0)); - return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask, - /*ZeroIsPoison=*/false, Name); - } - // If there are multiple operands, create a chain of selects to pick the - // first operand with an active lane and add the number of lanes of the - // preceding operands. - Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt64Ty(), State.VF); - unsigned LastOpIdx = getNumOperands() - 1; - Value *Res = nullptr; - for (int Idx = LastOpIdx; Idx >= 0; --Idx) { - Value *TrailingZeros = - State.VF.isScalar() - ? 
Builder.CreateZExt( - Builder.CreateICmpEQ(State.get(getOperand(Idx)), - Builder.getFalse()), - Builder.getInt64Ty()) - : Builder.CreateCountTrailingZeroElems( - Builder.getInt64Ty(), State.get(getOperand(Idx)), - /*ZeroIsPoison=*/false, Name); - Value *Current = Builder.CreateAdd( - Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); - if (Res) { - Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF); - Res = Builder.CreateSelect(Cmp, Current, Res); - } else { - Res = Current; - } - } - - return Res; - } - case VPInstruction::ResumeForEpilogue: - return State.get(getOperand(0), true); - default: - llvm_unreachable("Unsupported opcode for instruction"); - } -} - -InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( - unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const { - Type *ScalarTy = Ctx.Types.inferScalarType(this); - Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy; - switch (Opcode) { - case Instruction::FNeg: - return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - TargetTransformInfo::OperandValueInfo RHSInfo = { - TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}; - - if (VF.isVector()) { - // Certain instructions can be cheaper to vectorize if they have a - // constant second vector operand. One example of this are shifts on x86. - VPValue *RHS = getOperand(1); - RHSInfo = Ctx.getOperandInfo(RHS); - - if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue && - getOperand(1)->isDefinedOutsideLoopRegions()) - RHSInfo.Kind = TargetTransformInfo::OK_UniformValue; - } - - Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); - SmallVector Operands; - if (CtxI) - Operands.append(CtxI->value_op_begin(), CtxI->value_op_end()); - return Ctx.TTI.getArithmeticInstrCost( - Opcode, ResultTy, Ctx.CostKind, - {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, - RHSInfo, Operands, CtxI, &Ctx.TLI); - } - case Instruction::Freeze: - // This opcode is unknown. Assume that it is the same as 'mul'. - return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy, - Ctx.CostKind); - case Instruction::ExtractValue: - return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue, - Ctx.CostKind); - case Instruction::ICmp: - case Instruction::FCmp: { - Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0)); - Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy; - Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); - return Ctx.TTI.getCmpSelInstrCost( - Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(), - Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_AnyValue, TTI::OP_None}, CtxI); - } - } - llvm_unreachable("called for unsupported opcode"); -} - -InstructionCost VPInstruction::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - if (Instruction::isBinaryOp(getOpcode())) { - if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) { - // TODO: Compute cost for VPInstructions without underlying values once - // the legacy cost model has been retired. 
- return 0; - } - - assert(!doesGeneratePerAllLanes() && - "Should only generate a vector value or single scalar, not scalars " - "for all lanes."); - return getCostForRecipeWithOpcode( - getOpcode(), - vputils::onlyFirstLaneUsed(this) ? ElementCount::getFixed(1) : VF, Ctx); - } - - switch (getOpcode()) { - case Instruction::Select: { - llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; - match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())); - auto *CondTy = Ctx.Types.inferScalarType(getOperand(0)); - auto *VecTy = Ctx.Types.inferScalarType(getOperand(1)); - if (!vputils::onlyFirstLaneUsed(this)) { - CondTy = toVectorTy(CondTy, VF); - VecTy = toVectorTy(VecTy, VF); - } - return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred, - Ctx.CostKind); - } - case Instruction::ExtractElement: - case VPInstruction::ExtractLane: { - if (VF.isScalar()) { - // ExtractLane with VF=1 takes care of handling extracting across multiple - // parts. - return 0; - } - - // Add on the cost of extracting the element. - auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); - return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, - Ctx.CostKind); - } - case VPInstruction::AnyOf: { - auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); - return Ctx.TTI.getArithmeticReductionCost( - Instruction::Or, cast(VecTy), std::nullopt, Ctx.CostKind); - } - case VPInstruction::FirstActiveLane: { - Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); - if (VF.isScalar()) - return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy), - CmpInst::ICMP_EQ, Ctx.CostKind); - // Calculate the cost of determining the lane index. - auto *PredTy = toVectorTy(ScalarTy, VF); - IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, - Type::getInt64Ty(Ctx.LLVMCtx), - {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); - return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); - } - case VPInstruction::LastActiveLane: { - Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); - if (VF.isScalar()) - return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, - CmpInst::makeCmpResultType(ScalarTy), - CmpInst::ICMP_EQ, Ctx.CostKind); - // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB. - auto *PredTy = toVectorTy(ScalarTy, VF); - IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, - Type::getInt64Ty(Ctx.LLVMCtx), - {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); - InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); - // Add cost of NOT operation on the predicate. - Cost += Ctx.TTI.getArithmeticInstrCost( - Instruction::Xor, PredTy, Ctx.CostKind, - {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, - {TargetTransformInfo::OK_UniformConstantValue, - TargetTransformInfo::OP_None}); - // Add cost of SUB operation on the index. 
-    Cost += Ctx.TTI.getArithmeticInstrCost(
-        Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind);
-    return Cost;
-  }
-  case VPInstruction::FirstOrderRecurrenceSplice: {
-    assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
-    SmallVector<int> Mask(VF.getKnownMinValue());
-    std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
-    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
-
-    return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
-                                  cast<VectorType>(VectorTy),
-                                  cast<VectorType>(VectorTy), Mask,
-                                  Ctx.CostKind, VF.getKnownMinValue() - 1);
-  }
-  case VPInstruction::ActiveLaneMask: {
-    Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
-    unsigned Multiplier =
-        cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
-    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
-    IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
-                                  {ArgTy, ArgTy});
-    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
-  }
-  case VPInstruction::ExplicitVectorLength: {
-    Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
-    Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
-    Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
-    IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
-                                  I32Ty, {Arg0Ty, I32Ty, I1Ty});
-    return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
-  }
-  case VPInstruction::ExtractLastLane: {
-    // Add on the cost of extracting the element.
-    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
-    return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
-                                                    VecTy, Ctx.CostKind, 0);
-  }
-  case VPInstruction::ExtractPenultimateElement:
-    if (VF == ElementCount::getScalable(1))
-      return InstructionCost::getInvalid();
-    [[fallthrough]];
-  default:
-    // TODO: Compute cost for other VPInstructions once the legacy cost model
-    // has been retired.
-    assert(!getUnderlyingValue() &&
-           "unexpected VPInstruction with underlying value");
-    return 0;
-  }
-}
-
-bool VPInstruction::isVectorToScalar() const {
-  return getOpcode() == VPInstruction::ExtractLastLane ||
-         getOpcode() == VPInstruction::ExtractPenultimateElement ||
-         getOpcode() == Instruction::ExtractElement ||
-         getOpcode() == VPInstruction::ExtractLane ||
-         getOpcode() == VPInstruction::FirstActiveLane ||
-         getOpcode() == VPInstruction::LastActiveLane ||
-         getOpcode() == VPInstruction::ComputeAnyOfResult ||
-         getOpcode() == VPInstruction::ComputeFindIVResult ||
-         getOpcode() == VPInstruction::ComputeReductionResult ||
-         getOpcode() == VPInstruction::AnyOf;
-}
-
-bool VPInstruction::isSingleScalar() const {
-  switch (getOpcode()) {
-  case Instruction::PHI:
-  case VPInstruction::ExplicitVectorLength:
-  case VPInstruction::ResumeForEpilogue:
-  case VPInstruction::VScale:
-    return true;
-  default:
-    return isScalarCast();
-  }
-}
-
-void VPInstruction::execute(VPTransformState &State) {
-  assert(!State.Lane && "VPInstruction executing a Lane");
-  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
-  assert(flagsValidForOpcode(getOpcode()) &&
-         "Set flags not supported for the provided opcode");
-  if (hasFastMathFlags())
-    State.Builder.setFastMathFlags(getFastMathFlags());
-  Value *GeneratedValue = generate(State);
-  if (!hasResult())
-    return;
-  assert(GeneratedValue && "generate must produce a value");
-  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
-                                   (vputils::onlyFirstLaneUsed(this) ||
-                                    isVectorToScalar() || isSingleScalar());
-  assert((((GeneratedValue->getType()->isVectorTy() ||
-            GeneratedValue->getType()->isStructTy()) ==
-           !GeneratesPerFirstLaneOnly) ||
-          State.VF.isScalar()) &&
-         "scalar value but not only first lane defined");
-  State.set(this, GeneratedValue,
-            /*IsScalar*/ GeneratesPerFirstLaneOnly);
-}
-
-bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
-  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
-    return false;
-  switch (getOpcode()) {
-  case Instruction::GetElementPtr:
-  case Instruction::ExtractElement:
-  case Instruction::Freeze:
-  case Instruction::FCmp:
-  case Instruction::ICmp:
-  case Instruction::Select:
-  case Instruction::PHI:
-  case VPInstruction::AnyOf:
-  case VPInstruction::BranchOnCond:
-  case VPInstruction::BranchOnCount:
-  case VPInstruction::Broadcast:
-  case VPInstruction::BuildStructVector:
-  case VPInstruction::BuildVector:
-  case VPInstruction::CalculateTripCountMinusVF:
-  case VPInstruction::CanonicalIVIncrementForPart:
-  case VPInstruction::ExtractLane:
-  case VPInstruction::ExtractLastLane:
-  case VPInstruction::ExtractLastPart:
-  case VPInstruction::ExtractPenultimateElement:
-  case VPInstruction::ActiveLaneMask:
-  case VPInstruction::ExplicitVectorLength:
-  case VPInstruction::FirstActiveLane:
-  case VPInstruction::LastActiveLane:
-  case VPInstruction::FirstOrderRecurrenceSplice:
-  case VPInstruction::LogicalAnd:
-  case VPInstruction::Not:
-  case VPInstruction::PtrAdd:
-  case VPInstruction::WideIVStep:
-  case VPInstruction::WidePtrAdd:
-  case VPInstruction::StepVector:
-  case VPInstruction::ReductionStartVector:
-  case VPInstruction::VScale:
-  case VPInstruction::Unpack:
-    return false;
-  default:
-    return true;
-  }
-}
-
-bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
-  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
-  if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
-    return
vputils::onlyFirstLaneUsed(this); - - switch (getOpcode()) { - default: - return false; - case Instruction::ExtractElement: - return Op == getOperand(1); - case Instruction::PHI: - return true; - case Instruction::FCmp: - case Instruction::ICmp: - case Instruction::Select: - case Instruction::Or: - case Instruction::Freeze: - case VPInstruction::Not: - // TODO: Cover additional opcodes. - return vputils::onlyFirstLaneUsed(this); - case VPInstruction::ActiveLaneMask: - case VPInstruction::ExplicitVectorLength: - case VPInstruction::CalculateTripCountMinusVF: - case VPInstruction::CanonicalIVIncrementForPart: - case VPInstruction::BranchOnCount: - case VPInstruction::BranchOnCond: - case VPInstruction::Broadcast: - case VPInstruction::ReductionStartVector: - return true; - case VPInstruction::BuildStructVector: - case VPInstruction::BuildVector: - // Before replicating by VF, Build(Struct)Vector uses all lanes of the - // operand, after replicating its operands only the first lane is used. - // Before replicating, it will have only a single operand. - return getNumOperands() > 1; - case VPInstruction::PtrAdd: - return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this); - case VPInstruction::WidePtrAdd: - // WidePtrAdd supports scalar and vector base addresses. - return false; - case VPInstruction::ComputeAnyOfResult: - case VPInstruction::ComputeFindIVResult: - return Op == getOperand(1); - case VPInstruction::ExtractLane: - return Op == getOperand(0); - }; - llvm_unreachable("switch should return"); -} - -bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const { - assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - if (Instruction::isBinaryOp(getOpcode())) - return vputils::onlyFirstPartUsed(this); - - switch (getOpcode()) { - default: - return false; - case Instruction::FCmp: - case Instruction::ICmp: - case Instruction::Select: - return vputils::onlyFirstPartUsed(this); - case VPInstruction::BranchOnCount: - case VPInstruction::BranchOnCond: - case VPInstruction::CanonicalIVIncrementForPart: - return true; - }; - llvm_unreachable("switch should return"); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPInstruction::dump() const { - VPSlotTracker SlotTracker(getParent()->getPlan()); - printRecipe(dbgs(), "", SlotTracker); -} - -void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; - - if (hasResult()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - - switch (getOpcode()) { - case VPInstruction::Not: - O << "not"; - break; - case VPInstruction::SLPLoad: - O << "combined load"; - break; - case VPInstruction::SLPStore: - O << "combined store"; - break; - case VPInstruction::ActiveLaneMask: - O << "active lane mask"; - break; - case VPInstruction::ExplicitVectorLength: - O << "EXPLICIT-VECTOR-LENGTH"; - break; - case VPInstruction::FirstOrderRecurrenceSplice: - O << "first-order splice"; - break; - case VPInstruction::BranchOnCond: - O << "branch-on-cond"; - break; - case VPInstruction::CalculateTripCountMinusVF: - O << "TC > VF ? 
TC - VF : 0"; - break; - case VPInstruction::CanonicalIVIncrementForPart: - O << "VF * Part +"; - break; - case VPInstruction::BranchOnCount: - O << "branch-on-count"; - break; - case VPInstruction::Broadcast: - O << "broadcast"; - break; - case VPInstruction::BuildStructVector: - O << "buildstructvector"; - break; - case VPInstruction::BuildVector: - O << "buildvector"; - break; - case VPInstruction::ExtractLane: - O << "extract-lane"; - break; - case VPInstruction::ExtractLastLane: - O << "extract-last-lane"; - break; - case VPInstruction::ExtractLastPart: - O << "extract-last-part"; - break; - case VPInstruction::ExtractPenultimateElement: - O << "extract-penultimate-element"; - break; - case VPInstruction::ComputeAnyOfResult: - O << "compute-anyof-result"; - break; - case VPInstruction::ComputeFindIVResult: - O << "compute-find-iv-result"; - break; - case VPInstruction::ComputeReductionResult: - O << "compute-reduction-result"; - break; - case VPInstruction::LogicalAnd: - O << "logical-and"; - break; - case VPInstruction::PtrAdd: - O << "ptradd"; - break; - case VPInstruction::WidePtrAdd: - O << "wide-ptradd"; - break; - case VPInstruction::AnyOf: - O << "any-of"; - break; - case VPInstruction::FirstActiveLane: - O << "first-active-lane"; - break; - case VPInstruction::LastActiveLane: - O << "last-active-lane"; - break; - case VPInstruction::ReductionStartVector: - O << "reduction-start-vector"; - break; - case VPInstruction::ResumeForEpilogue: - O << "resume-for-epilogue"; - break; - case VPInstruction::Unpack: - O << "unpack"; - break; - default: - O << Instruction::getOpcodeName(getOpcode()); - } - - printFlags(O); - printOperands(O, SlotTracker); -} -#endif - -void VPInstructionWithType::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); - if (isScalarCast()) { - Value *Op = State.get(getOperand(0), VPLane(0)); - Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()), - Op, ResultTy); - State.set(this, Cast, VPLane(0)); - return; - } - switch (getOpcode()) { - case VPInstruction::StepVector: { - Value *StepVector = - State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF)); - State.set(this, StepVector); - break; - } - case VPInstruction::VScale: { - Value *VScale = State.Builder.CreateVScale(ResultTy); - State.set(this, VScale, true); - break; - } - - default: - llvm_unreachable("opcode not implemented yet"); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT" << (isSingleScalar() ? 
"-SCALAR" : "") << " "; - printAsOperand(O, SlotTracker); - O << " = "; - - switch (getOpcode()) { - case VPInstruction::WideIVStep: - O << "wide-iv-step "; - printOperands(O, SlotTracker); - break; - case VPInstruction::StepVector: - O << "step-vector " << *ResultTy; - break; - case VPInstruction::VScale: - O << "vscale " << *ResultTy; - break; - default: - assert(Instruction::isCast(getOpcode()) && "unhandled opcode"); - O << Instruction::getOpcodeName(getOpcode()) << " "; - printOperands(O, SlotTracker); - O << " to " << *ResultTy; - } -} -#endif - -void VPPhi::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); - PHINode *NewPhi = State.Builder.CreatePHI( - State.TypeAnalysis.inferScalarType(this), 2, getName()); - unsigned NumIncoming = getNumIncoming(); - if (getParent() != getParent()->getPlan()->getScalarPreheader()) { - // TODO: Fixup all incoming values of header phis once recipes defining them - // are introduced. - NumIncoming = 1; - } - for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) { - Value *IncV = State.get(getIncomingValue(Idx), VPLane(0)); - BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx)); - NewPhi->addIncoming(IncV, PredBB); - } - State.set(this, NewPhi, VPLane(0)); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; - printAsOperand(O, SlotTracker); - O << " = phi "; - printPhiOperands(O, SlotTracker); -} -#endif - -VPIRInstruction *VPIRInstruction ::create(Instruction &I) { - if (auto *Phi = dyn_cast(&I)) - return new VPIRPhi(*Phi); - return new VPIRInstruction(I); -} - -void VPIRInstruction::execute(VPTransformState &State) { - assert(!isa(this) && getNumOperands() == 0 && - "PHINodes must be handled by VPIRPhi"); - // Advance the insert point after the wrapped IR instruction. This allows - // interleaving VPIRInstructions and other recipes. - State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator())); -} - -InstructionCost VPIRInstruction::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - // The recipe wraps an existing IR instruction on the border of VPlan's scope, - // hence it does not contribute to the cost-modeling for the VPlan. - return 0; -} - -void VPIRInstruction::extractLastLaneOfLastPartOfFirstOperand( - VPBuilder &Builder) { - assert(isa(getInstruction()) && - "can only update exiting operands to phi nodes"); - assert(getNumOperands() > 0 && "must have at least one operand"); - VPValue *Exiting = getOperand(0); - if (Exiting->isLiveIn()) - return; - - Exiting = Builder.createNaryOp(VPInstruction::ExtractLastPart, Exiting); - Exiting = Builder.createNaryOp(VPInstruction::ExtractLastLane, Exiting); - setOperand(0, Exiting); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "IR " << I; -} -#endif - -void VPIRPhi::execute(VPTransformState &State) { - PHINode *Phi = &getIRPhi(); - for (const auto &[Idx, Op] : enumerate(operands())) { - VPValue *ExitValue = Op; - auto Lane = vputils::isSingleScalar(ExitValue) - ? 
VPLane::getFirstLane() - : VPLane::getLastLaneForVF(State.VF); - VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; - auto *PredVPBB = Pred->getExitingBasicBlock(); - BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; - // Set insertion point in PredBB in case an extract needs to be generated. - // TODO: Model extracts explicitly. - State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); - Value *V = State.get(ExitValue, VPLane(Lane)); - // If there is no existing block for PredBB in the phi, add a new incoming - // value. Otherwise update the existing incoming value for PredBB. - if (Phi->getBasicBlockIndex(PredBB) == -1) - Phi->addIncoming(V, PredBB); - else - Phi->setIncomingValueForBlock(PredBB, V); - } - - // Advance the insert point after the wrapped IR instruction. This allows - // interleaving VPIRInstructions and other recipes. - State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator())); -} - -void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { - VPRecipeBase *R = const_cast(getAsRecipe()); - assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && - "Number of phi operands must match number of predecessors"); - unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock); - R->removeOperand(Position); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPPhiAccessors::printPhiOperands(raw_ostream &O, - VPSlotTracker &SlotTracker) const { - interleaveComma(enumerate(getAsRecipe()->operands()), O, - [this, &O, &SlotTracker](auto Op) { - O << "[ "; - Op.value()->printAsOperand(O, SlotTracker); - O << ", "; - getIncomingBlock(Op.index())->printAsOperand(O); - O << " ]"; - }); -} -#endif - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - VPIRInstruction::printRecipe(O, Indent, SlotTracker); - - if (getNumOperands() != 0) { - O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; - interleaveComma(incoming_values_and_blocks(), O, - [&O, &SlotTracker](auto Op) { - std::get<0>(Op)->printAsOperand(O, SlotTracker); - O << " from "; - std::get<1>(Op)->printAsOperand(O); - }); - O << ")"; - } -} -#endif - -void VPIRMetadata::applyMetadata(Instruction &I) const { - for (const auto &[Kind, Node] : Metadata) - I.setMetadata(Kind, Node); -} - -void VPIRMetadata::intersect(const VPIRMetadata &Other) { - SmallVector> MetadataIntersection; - for (const auto &[KindA, MDA] : Metadata) { - for (const auto &[KindB, MDB] : Other.Metadata) { - if (KindA == KindB && MDA == MDB) { - MetadataIntersection.emplace_back(KindA, MDA); - break; - } - } - } - Metadata = std::move(MetadataIntersection); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { - const Module *M = SlotTracker.getModule(); - if (Metadata.empty() || !M) - return; - - ArrayRef MDNames = SlotTracker.getMDNames(); - O << " ("; - interleaveComma(Metadata, O, [&](const auto &KindNodePair) { - auto [Kind, Node] = KindNodePair; - assert(Kind < MDNames.size() && !MDNames[Kind].empty() && - "Unexpected unnamed metadata kind"); - O << "!" 
<< MDNames[Kind] << " "; - Node->printAsOperand(O, M); - }); - O << ")"; -} -#endif - -void VPWidenCallRecipe::execute(VPTransformState &State) { - assert(State.VF.isVector() && "not widening"); - assert(Variant != nullptr && "Can't create vector function."); - - FunctionType *VFTy = Variant->getFunctionType(); - // Add return type if intrinsic is overloaded on it. - SmallVector Args; - for (const auto &I : enumerate(args())) { - Value *Arg; - // Some vectorized function variants may also take a scalar argument, - // e.g. linear parameters for pointers. This needs to be the scalar value - // from the start of the respective part when interleaving. - if (!VFTy->getParamType(I.index())->isVectorTy()) - Arg = State.get(I.value(), VPLane(0)); - else - Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); - Args.push_back(Arg); - } - - auto *CI = cast_or_null(getUnderlyingValue()); - SmallVector OpBundles; - if (CI) - CI->getOperandBundlesAsDefs(OpBundles); - - CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles); - applyFlags(*V); - applyMetadata(*V); - V->setCallingConv(Variant->getCallingConv()); - - if (!V->getType()->isVoidTy()) - State.set(this, V); -} - -InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(), - Variant->getFunctionType()->params(), - Ctx.CostKind); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-CALL "; - - Function *CalledFn = getCalledScalarFunction(); - if (CalledFn->getReturnType()->isVoidTy()) - O << "void "; - else { - printAsOperand(O, SlotTracker); - O << " = "; - } - - O << "call"; - printFlags(O); - O << " @" << CalledFn->getName() << "("; - interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) { - Op->printAsOperand(O, SlotTracker); - }); - O << ")"; - - O << " (using library function"; - if (Variant->hasName()) - O << ": " << Variant->getName(); - O << ")"; -} -#endif - -void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { - assert(State.VF.isVector() && "not widening"); - - SmallVector TysForDecl; - // Add return type if intrinsic is overloaded on it. - if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI)) - TysForDecl.push_back(VectorType::get(getResultType(), State.VF)); - SmallVector Args; - for (const auto &I : enumerate(operands())) { - // Some intrinsics have a scalar argument - don't replace it with a - // vector. - Value *Arg; - if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(), - State.TTI)) - Arg = State.get(I.value(), VPLane(0)); - else - Arg = State.get(I.value(), usesFirstLaneOnly(I.value())); - if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(), - State.TTI)) - TysForDecl.push_back(Arg->getType()); - Args.push_back(Arg); - } - - // Use vector version of the intrinsic. 
- Module *M = State.Builder.GetInsertBlock()->getModule(); - Function *VectorF = - Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); - assert(VectorF && - "Can't retrieve vector intrinsic or vector-predication intrinsics."); - - auto *CI = cast_or_null(getUnderlyingValue()); - SmallVector OpBundles; - if (CI) - CI->getOperandBundlesAsDefs(OpBundles); - - CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); - - applyFlags(*V); - applyMetadata(*V); - - if (!V->getType()->isVoidTy()) - State.set(this, V); -} - -/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R. -static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, - ArrayRef Operands, - const VPRecipeWithIRFlags &R, - ElementCount VF, - VPCostContext &Ctx) { - // Some backends analyze intrinsic arguments to determine cost. Use the - // underlying value for the operand if it has one. Otherwise try to use the - // operand of the underlying call instruction, if there is one. Otherwise - // clear Arguments. - // TODO: Rework TTI interface to be independent of concrete IR values. - SmallVector Arguments; - for (const auto &[Idx, Op] : enumerate(Operands)) { - auto *V = Op->getUnderlyingValue(); - if (!V) { - if (auto *UI = dyn_cast_or_null(R.getUnderlyingValue())) { - Arguments.push_back(UI->getArgOperand(Idx)); - continue; - } - Arguments.clear(); - break; - } - Arguments.push_back(V); - } - - Type *ScalarRetTy = Ctx.Types.inferScalarType(&R); - Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy; - SmallVector ParamTys; - for (const VPValue *Op : Operands) { - ParamTys.push_back(VF.isVector() - ? toVectorTy(Ctx.Types.inferScalarType(Op), VF) - : Ctx.Types.inferScalarType(Op)); - } - - // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. - FastMathFlags FMF = - R.hasFastMathFlags() ? 
R.getFastMathFlags() : FastMathFlags(); - IntrinsicCostAttributes CostAttrs( - ID, RetTy, Arguments, ParamTys, FMF, - dyn_cast_or_null(R.getUnderlyingValue()), - InstructionCost::getInvalid(), &Ctx.TLI); - return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); -} - -InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - SmallVector ArgOps(operands()); - return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx); -} - -StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { - return Intrinsic::getBaseName(VectorIntrinsicID); -} - -bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const { - assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return all_of(enumerate(operands()), [this, &Op](const auto &X) { - auto [Idx, V] = X; - return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(), - Idx, nullptr); - }); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-INTRINSIC "; - if (ResultTy->isVoidTy()) { - O << "void "; - } else { - printAsOperand(O, SlotTracker); - O << " = "; - } - - O << "call"; - printFlags(O); - O << getIntrinsicName() << "("; - - interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { - Op->printAsOperand(O, SlotTracker); - }); - O << ")"; -} -#endif - -void VPHistogramRecipe::execute(VPTransformState &State) { - IRBuilderBase &Builder = State.Builder; - - Value *Address = State.get(getOperand(0)); - Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true); - VectorType *VTy = cast(Address->getType()); - - // The histogram intrinsic requires a mask even if the recipe doesn't; - // if the mask operand was omitted then all lanes should be executed and - // we just need to synthesize an all-true mask. - Value *Mask = nullptr; - if (VPValue *VPMask = getMask()) - Mask = State.get(VPMask); - else - Mask = - Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1)); - - // If this is a subtract, we want to invert the increment amount. We may - // add a separate intrinsic in future, but for now we'll try this. - if (Opcode == Instruction::Sub) - IncAmt = Builder.CreateNeg(IncAmt); - else - assert(Opcode == Instruction::Add && "only add or sub supported for now"); - - State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add, - {VTy, IncAmt->getType()}, - {Address, IncAmt, Mask}); -} - -InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - // FIXME: Take the gather and scatter into account as well. For now we're - // generating the same cost as the fallback path, but we'll likely - // need to create a new TTI method for determining the cost, including - // whether we can use base + vec-of-smaller-indices or just - // vec-of-pointers. - assert(VF.isVector() && "Invalid VF for histogram cost"); - Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0)); - VPValue *IncAmt = getOperand(1); - Type *IncTy = Ctx.Types.inferScalarType(IncAmt); - VectorType *VTy = VectorType::get(IncTy, VF); - - // Assume that a non-constant update value (or a constant != 1) requires - // a multiply, and add that into the cost. 
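- // Editorial illustration (hypothetical IR, VF=4): the operation being
- // costed corresponds to
- //   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
- //       <4 x ptr> %buckets, i32 %inc, <4 x i1> %mask)
- // plus the add/sub on the bucket counts; only a non-unit %inc pays the
- // extra multiply modelled below.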
- InstructionCost MulCost = - Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind); - if (IncAmt->isLiveIn()) { - ConstantInt *CI = dyn_cast(IncAmt->getLiveInIRValue()); - - if (CI && CI->getZExtValue() == 1) - MulCost = TTI::TCC_Free; - } - - // Find the cost of the histogram operation itself. - Type *PtrTy = VectorType::get(AddressTy, VF); - Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF); - IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, - Type::getVoidTy(Ctx.LLVMCtx), - {PtrTy, IncTy, MaskTy}); - - // Add the costs together with the add/sub operation. - return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost + - Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-HISTOGRAM buckets: "; - getOperand(0)->printAsOperand(O, SlotTracker); - - if (Opcode == Instruction::Sub) - O << ", dec: "; - else { - assert(Opcode == Instruction::Add); - O << ", inc: "; - } - getOperand(1)->printAsOperand(O, SlotTracker); - - if (VPValue *Mask = getMask()) { - O << ", mask: "; - Mask->printAsOperand(O, SlotTracker); - } -} - -void VPWidenSelectRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-SELECT "; - printAsOperand(O, SlotTracker); - O << " = select"; - printFlags(O); - getOperand(0)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(1)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(2)->printAsOperand(O, SlotTracker); - O << (vputils::isSingleScalar(getCond()) ? " (condition is single-scalar)" - : ""); -} -#endif - -void VPWidenSelectRecipe::execute(VPTransformState &State) { - Value *Cond = State.get(getCond(), vputils::isSingleScalar(getCond())); - - Value *Op0 = State.get(getOperand(1)); - Value *Op1 = State.get(getOperand(2)); - Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); - State.set(this, Sel); - if (auto *I = dyn_cast(Sel)) { - if (isa(I)) - applyFlags(*I); - applyMetadata(*I); - } -} - -InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - SelectInst *SI = cast(getUnderlyingValue()); - bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions(); - Type *ScalarTy = Ctx.Types.inferScalarType(this); - Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); - - VPValue *Op0, *Op1; - if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 && - (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) || - match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) { - // select x, y, false --> x & y - // select x, true, y --> x | y - const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0); - const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1); - - SmallVector Operands; - if (all_of(operands(), - [](VPValue *Op) { return Op->getUnderlyingValue(); })) - Operands.append(SI->op_begin(), SI->op_end()); - bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))); - return Ctx.TTI.getArithmeticInstrCost( - IsLogicalOr ? 
Instruction::Or : Instruction::And, VectorTy, - Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI); - } - - Type *CondTy = Ctx.Types.inferScalarType(getOperand(0)); - if (!ScalarCond) - CondTy = VectorType::get(CondTy, VF); - - CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; - if (auto *Cmp = dyn_cast(SI->getCondition())) - Pred = Cmp->getPredicate(); - return Ctx.TTI.getCmpSelInstrCost( - Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI); -} - -VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { - AllowReassoc = FMF.allowReassoc(); - NoNaNs = FMF.noNaNs(); - NoInfs = FMF.noInfs(); - NoSignedZeros = FMF.noSignedZeros(); - AllowReciprocal = FMF.allowReciprocal(); - AllowContract = FMF.allowContract(); - ApproxFunc = FMF.approxFunc(); -} - -#if !defined(NDEBUG) -bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { - switch (OpType) { - case OperationType::OverflowingBinOp: - return Opcode == Instruction::Add || Opcode == Instruction::Sub || - Opcode == Instruction::Mul || Opcode == Instruction::Shl || - Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart; - case OperationType::Trunc: - return Opcode == Instruction::Trunc; - case OperationType::DisjointOp: - return Opcode == Instruction::Or; - case OperationType::PossiblyExactOp: - return Opcode == Instruction::AShr || Opcode == Instruction::LShr || - Opcode == Instruction::UDiv || Opcode == Instruction::SDiv; - case OperationType::GEPOp: - return Opcode == Instruction::GetElementPtr || - Opcode == VPInstruction::PtrAdd || - Opcode == VPInstruction::WidePtrAdd; - case OperationType::FPMathOp: - return Opcode == Instruction::Call || Opcode == Instruction::FAdd || - Opcode == Instruction::FMul || Opcode == Instruction::FSub || - Opcode == Instruction::FNeg || Opcode == Instruction::FDiv || - Opcode == Instruction::FRem || Opcode == Instruction::FPExt || - Opcode == Instruction::FPTrunc || Opcode == Instruction::Select || - Opcode == VPInstruction::WideIVStep || - Opcode == VPInstruction::ReductionStartVector || - Opcode == VPInstruction::ComputeReductionResult; - case OperationType::FCmp: - return Opcode == Instruction::FCmp; - case OperationType::NonNegOp: - return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP; - case OperationType::Cmp: - return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp; - case OperationType::Other: - return true; - } - llvm_unreachable("Unknown OperationType enum"); -} -#endif - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPIRFlags::printFlags(raw_ostream &O) const { - switch (OpType) { - case OperationType::Cmp: - O << " " << CmpInst::getPredicateName(getPredicate()); - break; - case OperationType::FCmp: - O << " " << CmpInst::getPredicateName(getPredicate()); - getFastMathFlags().print(O); - break; - case OperationType::DisjointOp: - if (DisjointFlags.IsDisjoint) - O << " disjoint"; - break; - case OperationType::PossiblyExactOp: - if (ExactFlags.IsExact) - O << " exact"; - break; - case OperationType::OverflowingBinOp: - if (WrapFlags.HasNUW) - O << " nuw"; - if (WrapFlags.HasNSW) - O << " nsw"; - break; - case OperationType::Trunc: - if (TruncFlags.HasNUW) - O << " nuw"; - if (TruncFlags.HasNSW) - O << " nsw"; - break; - case OperationType::FPMathOp: - getFastMathFlags().print(O); - break; - case OperationType::GEPOp: - if (GEPFlags.isInBounds()) - O << " inbounds"; - else if (GEPFlags.hasNoUnsignedSignedWrap()) - O << " nusw"; - if 
(GEPFlags.hasNoUnsignedWrap()) - O << " nuw"; - break; - case OperationType::NonNegOp: - if (NonNegFlags.NonNeg) - O << " nneg"; - break; - case OperationType::Other: - break; - } - O << " "; -} -#endif - -void VPWidenRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - switch (Opcode) { - case Instruction::Call: - case Instruction::Br: - case Instruction::PHI: - case Instruction::GetElementPtr: - case Instruction::Select: - llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::FNeg: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen unops and binops. - SmallVector Ops; - for (VPValue *VPOp : operands()) - Ops.push_back(State.get(VPOp)); - - Value *V = Builder.CreateNAryOp(Opcode, Ops); - - if (auto *VecOp = dyn_cast(V)) { - applyFlags(*VecOp); - applyMetadata(*VecOp); - } - - // Use this vector value for all users of the original instruction. - State.set(this, V); - break; - } - case Instruction::ExtractValue: { - assert(getNumOperands() == 2 && "expected single level extractvalue"); - Value *Op = State.get(getOperand(0)); - auto *CI = cast(getOperand(1)->getLiveInIRValue()); - Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue()); - State.set(this, Extract); - break; - } - case Instruction::Freeze: { - Value *Op = State.get(getOperand(0)); - Value *Freeze = Builder.CreateFreeze(Op); - State.set(this, Freeze); - break; - } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = Opcode == Instruction::FCmp; - Value *A = State.get(getOperand(0)); - Value *B = State.get(getOperand(1)); - Value *C = nullptr; - if (FCmp) { - C = Builder.CreateFCmp(getPredicate(), A, B); - } else { - C = Builder.CreateICmp(getPredicate(), A, B); - } - if (auto *I = dyn_cast(C)) { - applyFlags(*I); - applyMetadata(*I); - } - State.set(this, C); - break; - } - default: - // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " - << Instruction::getOpcodeName(Opcode)); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. - -#if !defined(NDEBUG) - // Verify that VPlan type inference results agree with the type of the - // generated values. - assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) == - State.get(this)->getType() && - "inferred type and type from generated instructions do not match"); -#endif -} - -InstructionCost VPWidenRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - switch (Opcode) { - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - // If the div/rem operation isn't safe to speculate and requires - // predication, then the only way we can even create a vplan is to insert - // a select on the second input operand to ensure we use the value of 1 - // for the inactive lanes. The select will be costed separately. 
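- // Editorial illustration: a predicated 'udiv %a, %b' only becomes
- // vectorizable once the divisor is guarded, e.g. for VF=4:
- //   %safe.b = select <4 x i1> %mask, <4 x i32> %b,
- //                    <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- //   %div    = udiv <4 x i32> %a, %safe.b
- // so inactive lanes divide by 1; the select's cost is attributed to the
- // select recipe, not here.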
- case Instruction::FNeg: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Freeze: - case Instruction::ExtractValue: - case Instruction::ICmp: - case Instruction::FCmp: - return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx); - default: - llvm_unreachable("Unsupported opcode for instruction"); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN "; - printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(Opcode); - printFlags(O); - printOperands(O, SlotTracker); -} -#endif - -void VPWidenCastRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - /// Vectorize casts. - assert(State.VF.isVector() && "Not vectorizing?"); - Type *DestTy = VectorType::get(getResultType(), State.VF); - VPValue *Op = getOperand(0); - Value *A = State.get(Op); - Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); - State.set(this, Cast); - if (auto *CastOp = dyn_cast(Cast)) { - applyFlags(*CastOp); - applyMetadata(*CastOp); - } -} - -InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - // TODO: In some cases, VPWidenCastRecipes are created but not considered in - // the legacy cost model, including truncates/extends when evaluating a - // reduction in a smaller type. - if (!getUnderlyingValue()) - return 0; - // Computes the CastContextHint from a recipes that may access memory. - auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { - if (VF.isScalar()) - return TTI::CastContextHint::Normal; - if (isa(R)) - return TTI::CastContextHint::Interleave; - if (const auto *ReplicateRecipe = dyn_cast(R)) - return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked - : TTI::CastContextHint::Normal; - const auto *WidenMemoryRecipe = dyn_cast(R); - if (WidenMemoryRecipe == nullptr) - return TTI::CastContextHint::None; - if (!WidenMemoryRecipe->isConsecutive()) - return TTI::CastContextHint::GatherScatter; - if (WidenMemoryRecipe->isReverse()) - return TTI::CastContextHint::Reversed; - if (WidenMemoryRecipe->isMasked()) - return TTI::CastContextHint::Masked; - return TTI::CastContextHint::Normal; - }; - - VPValue *Operand = getOperand(0); - TTI::CastContextHint CCH = TTI::CastContextHint::None; - // For Trunc/FPTrunc, get the context from the only user. - if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) && - !hasMoreThanOneUniqueUser() && getNumUsers() > 0) { - if (auto *StoreRecipe = dyn_cast(*user_begin())) - CCH = ComputeCCH(StoreRecipe); - } - // For Z/Sext, get the context from the operand. - else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || - Opcode == Instruction::FPExt) { - if (Operand->isLiveIn()) - CCH = TTI::CastContextHint::Normal; - else if (Operand->getDefiningRecipe()) - CCH = ComputeCCH(Operand->getDefiningRecipe()); - } - - auto *SrcTy = - cast(toVectorTy(Ctx.Types.inferScalarType(Operand), VF)); - auto *DestTy = cast(toVectorTy(getResultType(), VF)); - // Arm TTI will use the underlying instruction to determine the cost. 
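- // Editorial illustration: a 'zext <8 x i8> to <8 x i16>' whose operand is
- // a masked consecutive load reaches this call with
- // CastContextHint::Masked, letting such targets fold the extend into the
- // widening load when pricing it.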
- return Ctx.TTI.getCastInstrCost( - Opcode, DestTy, SrcTy, CCH, Ctx.CostKind, - dyn_cast_if_present(getUnderlyingValue())); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-CAST "; - printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(Opcode); - printFlags(O); - printOperands(O, SlotTracker); - O << " to " << *getResultType(); -} -#endif - -InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); -} - -/// A helper function that returns an integer or floating-point constant with -/// value C. -static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { - return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) - : ConstantFP::get(Ty, C); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenIntOrFpInductionRecipe::printRecipe( - raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent; - printAsOperand(O, SlotTracker); - O << " = WIDEN-INDUCTION"; - printFlags(O); - O << " "; - printOperands(O, SlotTracker); - - if (auto *TI = getTruncInst()) - O << " (truncated to " << *TI->getType() << ")"; -} -#endif - -bool VPWidenIntOrFpInductionRecipe::isCanonical() const { - // The step may be defined by a recipe in the preheader (e.g. if it requires - // SCEV expansion), but for the canonical induction the step is required to be - // 1, which is represented as live-in. - if (getStepValue()->getDefiningRecipe()) - return false; - auto *StepC = dyn_cast(getStepValue()->getLiveInIRValue()); - auto *StartC = dyn_cast(getStartValue()->getLiveInIRValue()); - return StartC && StartC->isZero() && StepC && StepC->isOne() && - getScalarType() == getRegion()->getCanonicalIVType(); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent; - printAsOperand(O, SlotTracker); - O << " = DERIVED-IV "; - getStartValue()->printAsOperand(O, SlotTracker); - O << " + "; - getOperand(1)->printAsOperand(O, SlotTracker); - O << " * "; - getStepValue()->printAsOperand(O, SlotTracker); -} -#endif - -void VPScalarIVStepsRecipe::execute(VPTransformState &State) { - // Fast-math-flags propagate from the original induction instruction. - IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); - if (hasFastMathFlags()) - State.Builder.setFastMathFlags(getFastMathFlags()); - - /// Compute scalar induction steps. \p ScalarIV is the scalar induction - /// variable on which to base the steps, \p Step is the size of the step. - - Value *BaseIV = State.get(getOperand(0), VPLane(0)); - Value *Step = State.get(getStepValue(), VPLane(0)); - IRBuilderBase &Builder = State.Builder; - - // Ensure step has the same type as that of scalar IV. - Type *BaseIVTy = BaseIV->getType()->getScalarType(); - assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!"); - - // We build scalar steps for both integer and floating-point induction - // variables. Here, we determine the kind of arithmetic we will perform. 
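- // Editorial illustration: with the opcodes chosen below, lane l of an
- // integer IV is computed as
- //   add(BaseIV, mul(StartIdx0 + l, Step))
- // while a floating-point IV substitutes the induction opcode (e.g. fadd)
- // for the add and fmul for the mul.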
- Instruction::BinaryOps AddOp; - Instruction::BinaryOps MulOp; - if (BaseIVTy->isIntegerTy()) { - AddOp = Instruction::Add; - MulOp = Instruction::Mul; - } else { - AddOp = InductionOpcode; - MulOp = Instruction::FMul; - } - - // Determine the number of scalars we need to generate for each unroll - // iteration. - bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this); - // Compute the scalar steps and save the results in State. - Type *IntStepTy = - IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits()); - - unsigned StartLane = 0; - unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); - if (State.Lane) { - StartLane = State.Lane->getKnownLane(); - EndLane = StartLane + 1; - } - Value *StartIdx0; - if (getUnrollPart(*this) == 0) - StartIdx0 = ConstantInt::get(IntStepTy, 0); - else { - StartIdx0 = State.get(getOperand(2), true); - if (getUnrollPart(*this) != 1) { - StartIdx0 = - Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(), - getUnrollPart(*this))); - } - StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy); - } - - if (BaseIVTy->isFloatingPointTy()) - StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy); - - for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { - Value *StartIdx = Builder.CreateBinOp( - AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane)); - // The step returned by `createStepForVF` is a runtime-evaluated value - // when VF is scalable. Otherwise, it should be folded into a Constant. - assert((State.VF.isScalable() || isa(StartIdx)) && - "Expected StartIdx to be folded to a constant when VF is not " - "scalable"); - auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); - auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul); - State.set(this, Add, VPLane(Lane)); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent; - printAsOperand(O, SlotTracker); - O << " = SCALAR-STEPS "; - printOperands(O, SlotTracker); -} -#endif - -bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const { - assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return vputils::isSingleScalar(Op); -} - -void VPWidenGEPRecipe::execute(VPTransformState &State) { - assert(State.VF.isVector() && "not widening"); - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - - assert( - any_of(operands(), - [](VPValue *Op) { return !Op->isDefinedOutsideLoopRegions(); }) && - "Expected at least one loop-variant operand"); - - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers unless VF is scalar. - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant()); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector Indices; - for (unsigned I = 1, E = getNumOperands(); I < E; I++) { - VPValue *Operand = getOperand(I); - Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1))); - } - - // Create the new GEP. 
Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices, - "", getGEPNoWrapFlags()); - assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(this, NewGEP); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-GEP "; - O << (isPointerLoopInvariant() ? "Inv" : "Var"); - for (size_t I = 0; I < getNumOperands() - 1; ++I) - O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]"; - - O << " "; - printAsOperand(O, SlotTracker); - O << " = getelementptr"; - printFlags(O); - printOperands(O, SlotTracker); -} -#endif - -void VPVectorEndPointerRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - unsigned CurrentPart = getUnrollPart(*this); - const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); - Type *IndexTy = DL.getIndexType(State.TypeAnalysis.inferScalarType(this)); - - // The wide store needs to start at the last vector element. - Value *RunTimeVF = State.get(getVFValue(), VPLane(0)); - if (IndexTy != RunTimeVF->getType()) - RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy); - // NumElt = Stride * CurrentPart * RunTimeVF - Value *NumElt = Builder.CreateMul( - ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF); - // LastLane = Stride * (RunTimeVF - 1) - Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1)); - if (Stride != 1) - LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane); - Value *Ptr = State.get(getOperand(0), VPLane(0)); - Value *ResultPtr = - Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags()); - ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "", - getGEPNoWrapFlags()); - - State.set(this, ResultPtr, /*IsScalar*/ true); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent; - printAsOperand(O, SlotTracker); - O << " = vector-end-pointer"; - printFlags(O); - printOperands(O, SlotTracker); -} -#endif - -void VPVectorPointerRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - unsigned CurrentPart = getUnrollPart(*this); - const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout(); - Type *IndexTy = DL.getIndexType(State.TypeAnalysis.inferScalarType(this)); - Value *Ptr = State.get(getOperand(0), VPLane(0)); - - Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); - Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment, - "", getGEPNoWrapFlags()); - - State.set(this, ResultPtr, /*IsScalar*/ true); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent; - printAsOperand(O, SlotTracker); - O << " = vector-pointer"; - printFlags(O); - printOperands(O, SlotTracker); -} -#endif - -InstructionCost VPBlendRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); - Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF); - return (getNumIncomingValues() - 1) * - Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, 
CmpTy, - CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "BLEND "; - printAsOperand(O, SlotTracker); - O << " ="; - if (getNumIncomingValues() == 1) { - // Not a User of any mask: not really blending, this is a - // single-predecessor phi. - O << " "; - getIncomingValue(0)->printAsOperand(O, SlotTracker); - } else { - for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { - O << " "; - getIncomingValue(I)->printAsOperand(O, SlotTracker); - if (I == 0) - continue; - O << "/"; - getMask(I)->printAsOperand(O, SlotTracker); - } - } -} -#endif - -void VPReductionRecipe::execute(VPTransformState &State) { - assert(!State.Lane && "Reduction being replicated."); - RecurKind Kind = getRecurrenceKind(); - assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && - "In-loop AnyOf reductions aren't currently supported"); - // Propagate the fast-math flags carried by the underlying instruction. - IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); - State.Builder.setFastMathFlags(getFastMathFlags()); - Value *NewVecOp = State.get(getVecOp()); - if (VPValue *Cond = getCondOp()) { - Value *NewCond = State.get(Cond, State.VF.isScalar()); - VectorType *VecTy = dyn_cast(NewVecOp->getType()); - Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); - - Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags()); - if (State.VF.isVector()) - Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start); - - Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start); - NewVecOp = Select; - } - Value *NewRed; - Value *NextInChain; - if (isOrdered()) { - Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); - if (State.VF.isVector()) - NewRed = - createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain); - else - NewRed = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), - PrevInChain, NewVecOp); - PrevInChain = NewRed; - NextInChain = NewRed; - } else if (isPartialReduction()) { - assert(Kind == RecurKind::Add && "Unexpected partial reduction kind"); - Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false); - NewRed = State.Builder.CreateIntrinsic( - PrevInChain->getType(), Intrinsic::vector_partial_reduce_add, - {PrevInChain, NewVecOp}, nullptr, "partial.reduce"); - PrevInChain = NewRed; - NextInChain = NewRed; - } else { - assert(isInLoop() && - "The reduction must either be ordered, partial or in-loop"); - Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true); - NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) - NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain); - else - NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), - PrevInChain, NewRed); - } - State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction()); -} - -void VPReductionEVLRecipe::execute(VPTransformState &State) { - assert(!State.Lane && "Reduction being replicated."); - - auto &Builder = State.Builder; - // Propagate the fast-math flags carried by the underlying instruction. 
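- // Editorial note: FastMathFlagGuard is an RAII helper; the flags set on
- // the builder below apply to every instruction this recipe creates and
- // the previous flags are restored when the guard goes out of scope.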
- IRBuilderBase::FastMathFlagGuard FMFGuard(Builder); - Builder.setFastMathFlags(getFastMathFlags()); - - RecurKind Kind = getRecurrenceKind(); - Value *Prev = State.get(getChainOp(), /*IsScalar*/ true); - Value *VecOp = State.get(getVecOp()); - Value *EVL = State.get(getEVL(), VPLane(0)); - - Value *Mask; - if (VPValue *CondOp = getCondOp()) - Mask = State.get(CondOp); - else - Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); - - Value *NewRed; - if (isOrdered()) { - NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL); - } else { - NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) - NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev); - else - NewRed = Builder.CreateBinOp( - (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed, - Prev); - } - State.set(this, NewRed, /*IsScalar*/ true); -} - -InstructionCost VPReductionRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - RecurKind RdxKind = getRecurrenceKind(); - Type *ElementTy = Ctx.Types.inferScalarType(this); - auto *VectorTy = cast(toVectorTy(ElementTy, VF)); - unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind); - FastMathFlags FMFs = getFastMathFlags(); - std::optional OptionalFMF = - ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt; - - if (isPartialReduction()) { - InstructionCost CondCost = 0; - if (isConditional()) { - CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; - auto *CondTy = cast( - toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF)); - CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy, - CondTy, Pred, Ctx.CostKind); - } - return CondCost + Ctx.TTI.getPartialReductionCost( - Opcode, ElementTy, ElementTy, ElementTy, VF, - TargetTransformInfo::PR_None, - TargetTransformInfo::PR_None, std::nullopt, - Ctx.CostKind); - } - - // TODO: Support any-of reductions. - assert( - (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || - ForceTargetInstructionCost.getNumOccurrences() > 0) && - "Any-of reduction not implemented in VPlan-based cost model currently."); - - // Note that TTI should model the cost of moving result to the scalar register - // and the BinOp cost in the getMinMaxReductionCost(). - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { - Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); - return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind); - } - - // Note that TTI should model the cost of moving result to the scalar register - // and the BinOp cost in the getArithmeticReductionCost(). - return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF, - Ctx.CostKind); -} - -VPExpressionRecipe::VPExpressionRecipe( - ExpressionTypes ExpressionType, - ArrayRef ExpressionRecipes) - : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}), - ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) { - assert(!ExpressionRecipes.empty() && "Nothing to combine?"); - assert( - none_of(ExpressionRecipes, - [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && - "expression cannot contain recipes with side-effects"); - - // Maintain a copy of the expression recipes as a set of users. - SmallPtrSet ExpressionRecipesAsSetOfUsers; - for (auto *R : ExpressionRecipes) - ExpressionRecipesAsSetOfUsers.insert(R); - - // Recipes in the expression, except the last one, must only be used by - // (other) recipes inside the expression. 
If there are other users, external
- // to the expression, use a clone of the recipe for external users.
- for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
-   if (R != ExpressionRecipes.back() &&
-       any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
-         return !ExpressionRecipesAsSetOfUsers.contains(U);
-       })) {
-     // There are users outside of the expression. Clone the recipe and use
-     // the clone for those external users.
-     VPSingleDefRecipe *CopyForExtUsers = R->clone();
-     R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
-                                               VPUser &U, unsigned) {
-       return !ExpressionRecipesAsSetOfUsers.contains(&U);
-     });
-     CopyForExtUsers->insertBefore(R);
-   }
-   if (R->getParent())
-     R->removeFromParent();
- }
-
- // Internalize all external operands to the expression recipes. To do so,
- // create new temporary VPValues for all operands defined by a recipe outside
- // the expression. The original operands are added as operands of the
- // VPExpressionRecipe itself.
- for (auto *R : ExpressionRecipes) {
-   for (const auto &[Idx, Op] : enumerate(R->operands())) {
-     auto *Def = Op->getDefiningRecipe();
-     if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
-       continue;
-     addOperand(Op);
-     LiveInPlaceholders.push_back(new VPValue());
-   }
- }
-
- // Replace each external operand with the first one created for it in
- // LiveInPlaceholders.
- for (auto *R : ExpressionRecipes)
-   for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
-     R->replaceUsesOfWith(LiveIn, Tmp);
-}
-
-void VPExpressionRecipe::decompose() {
-  for (auto *R : ExpressionRecipes)
-    // Since the list could contain duplicates, make sure the recipe hasn't
-    // already been inserted.
-    if (!R->getParent())
-      R->insertBefore(this);
-
-  for (const auto &[Idx, Op] : enumerate(operands()))
-    LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
-
-  replaceAllUsesWith(ExpressionRecipes.back());
-  ExpressionRecipes.clear();
-}
-
-InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
-                                                VPCostContext &Ctx) const {
-  Type *RedTy = Ctx.Types.inferScalarType(this);
-  auto *SrcVecTy = cast<VectorType>(
-      toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
-  assert(RedTy->isIntegerTy() &&
-         "VPExpressionRecipe only supports integer types currently.");
-  unsigned Opcode = RecurrenceDescriptor::getOpcode(
-      cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
-  switch (ExpressionType) {
-  case ExpressionTypes::ExtendedReduction: {
-    unsigned Opcode = RecurrenceDescriptor::getOpcode(
-        cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
-    auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
-
-    return cast<VPReductionRecipe>(ExpressionRecipes.back())
-               ->isPartialReduction()
-               ?
Ctx.TTI.getPartialReductionCost( - Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, - RedTy, VF, - TargetTransformInfo::getPartialReductionExtendKind( - ExtR->getOpcode()), - TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind) - : Ctx.TTI.getExtendedReductionCost( - Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, - SrcVecTy, std::nullopt, Ctx.CostKind); - } - case ExpressionTypes::MulAccReduction: - return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy, - Ctx.CostKind); - - case ExpressionTypes::ExtNegatedMulAccReduction: - assert(Opcode == Instruction::Add && "Unexpected opcode"); - Opcode = Instruction::Sub; - [[fallthrough]]; - case ExpressionTypes::ExtMulAccReduction: { - auto *RedR = cast(ExpressionRecipes.back()); - if (RedR->isPartialReduction()) { - auto *Ext0R = cast(ExpressionRecipes[0]); - auto *Ext1R = cast(ExpressionRecipes[1]); - auto *Mul = cast(ExpressionRecipes[2]); - return Ctx.TTI.getPartialReductionCost( - Opcode, Ctx.Types.inferScalarType(getOperand(0)), - Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF, - TargetTransformInfo::getPartialReductionExtendKind( - Ext0R->getOpcode()), - TargetTransformInfo::getPartialReductionExtendKind( - Ext1R->getOpcode()), - Mul->getOpcode(), Ctx.CostKind); - } - return Ctx.TTI.getMulAccReductionCost( - cast(ExpressionRecipes.front())->getOpcode() == - Instruction::ZExt, - Opcode, RedTy, SrcVecTy, Ctx.CostKind); - } - } - llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum"); -} - -bool VPExpressionRecipe::mayReadOrWriteMemory() const { - return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) { - return R->mayReadFromMemory() || R->mayWriteToMemory(); - }); -} - -bool VPExpressionRecipe::mayHaveSideEffects() const { - assert( - none_of(ExpressionRecipes, - [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) && - "expression cannot contain recipes with side-effects"); - return false; -} - -bool VPExpressionRecipe::isSingleScalar() const { - // Cannot use vputils::isSingleScalar(), because all external operands - // of the expression will be live-ins while bundled. - auto *RR = dyn_cast(ExpressionRecipes.back()); - return RR && !RR->isPartialReduction(); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - -void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EXPRESSION "; - printAsOperand(O, SlotTracker); - O << " = "; - auto *Red = cast(ExpressionRecipes.back()); - unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); - - switch (ExpressionType) { - case ExpressionTypes::ExtendedReduction: { - getOperand(1)->printAsOperand(O, SlotTracker); - O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce."; - O << Instruction::getOpcodeName(Opcode) << " ("; - getOperand(0)->printAsOperand(O, SlotTracker); - Red->printFlags(O); - - auto *Ext0 = cast(ExpressionRecipes[0]); - O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " - << *Ext0->getResultType(); - if (Red->isConditional()) { - O << ", "; - Red->getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; - break; - } - case ExpressionTypes::ExtNegatedMulAccReduction: { - getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + " << (Red->isPartialReduction() ? "partial." 
: "") << "reduce."; - O << Instruction::getOpcodeName( - RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) - << " (sub (0, mul"; - auto *Mul = cast(ExpressionRecipes[2]); - Mul->printFlags(O); - O << "("; - getOperand(0)->printAsOperand(O, SlotTracker); - auto *Ext0 = cast(ExpressionRecipes[0]); - O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " - << *Ext0->getResultType() << "), ("; - getOperand(1)->printAsOperand(O, SlotTracker); - auto *Ext1 = cast(ExpressionRecipes[1]); - O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " - << *Ext1->getResultType() << ")"; - if (Red->isConditional()) { - O << ", "; - Red->getCondOp()->printAsOperand(O, SlotTracker); - } - O << "))"; - break; - } - case ExpressionTypes::MulAccReduction: - case ExpressionTypes::ExtMulAccReduction: { - getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce."; - O << Instruction::getOpcodeName( - RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) - << " ("; - O << "mul"; - bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction; - auto *Mul = cast(IsExtended ? ExpressionRecipes[2] - : ExpressionRecipes[0]); - Mul->printFlags(O); - if (IsExtended) - O << "("; - getOperand(0)->printAsOperand(O, SlotTracker); - if (IsExtended) { - auto *Ext0 = cast(ExpressionRecipes[0]); - O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to " - << *Ext0->getResultType() << "), ("; - } else { - O << ", "; - } - getOperand(1)->printAsOperand(O, SlotTracker); - if (IsExtended) { - auto *Ext1 = cast(ExpressionRecipes[1]); - O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to " - << *Ext1->getResultType() << ")"; - } - if (Red->isConditional()) { - O << ", "; - Red->getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; - break; - } - } -} - -void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - if (isPartialReduction()) - O << Indent << "PARTIAL-REDUCE "; - else - O << Indent << "REDUCE "; - printAsOperand(O, SlotTracker); - O << " = "; - getChainOp()->printAsOperand(O, SlotTracker); - O << " +"; - printFlags(O); - O << " reduce." - << Instruction::getOpcodeName( - RecurrenceDescriptor::getOpcode(getRecurrenceKind())) - << " ("; - getVecOp()->printAsOperand(O, SlotTracker); - if (isConditional()) { - O << ", "; - getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; -} - -void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "REDUCE "; - printAsOperand(O, SlotTracker); - O << " = "; - getChainOp()->printAsOperand(O, SlotTracker); - O << " +"; - printFlags(O); - O << " vp.reduce." - << Instruction::getOpcodeName( - RecurrenceDescriptor::getOpcode(getRecurrenceKind())) - << " ("; - getVecOp()->printAsOperand(O, SlotTracker); - O << ", "; - getEVL()->printAsOperand(O, SlotTracker); - if (isConditional()) { - O << ", "; - getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; -} - -#endif - -/// A helper function to scalarize a single Instruction in the innermost loop. -/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue -/// operands from \p RepRecipe instead of \p Instr's operands. 
-static void scalarizeInstruction(const Instruction *Instr,
-                                 VPReplicateRecipe *RepRecipe,
-                                 const VPLane &Lane, VPTransformState &State) {
-  assert((!Instr->getType()->isAggregateType() ||
-          canVectorizeTy(Instr->getType())) &&
-         "Expected vectorizable or non-aggregate type.");
-
-  // Does this instruction return a value?
-  bool IsVoidRetTy = Instr->getType()->isVoidTy();
-
-  Instruction *Cloned = Instr->clone();
-  if (!IsVoidRetTy) {
-    Cloned->setName(Instr->getName() + ".cloned");
-    Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
-    // The operands of the replicate recipe may have been narrowed, resulting
-    // in a narrower result type. Update the type of the cloned instruction to
-    // the correct type.
-    if (ResultTy != Cloned->getType())
-      Cloned->mutateType(ResultTy);
-  }
-
-  RepRecipe->applyFlags(*Cloned);
-  RepRecipe->applyMetadata(*Cloned);
-
-  if (RepRecipe->hasPredicate())
-    cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
-
-  if (auto DL = RepRecipe->getDebugLoc())
-    State.setDebugLocFrom(DL);
-
-  // Replace the operands of the cloned instruction with their scalar
-  // equivalents in the new loop.
-  for (const auto &I : enumerate(RepRecipe->operands())) {
-    auto InputLane = Lane;
-    VPValue *Operand = I.value();
-    if (vputils::isSingleScalar(Operand))
-      InputLane = VPLane::getFirstLane();
-    Cloned->setOperand(I.index(), State.get(Operand, InputLane));
-  }
-
-  // Place the cloned scalar in the new loop.
-  State.Builder.Insert(Cloned);
-
-  State.set(RepRecipe, Cloned, Lane);
-
-  // If we just cloned a new assumption, add it to the assumption cache.
-  if (auto *II = dyn_cast<AssumeInst>(Cloned))
-    State.AC->registerAssumption(II);
-
-  assert((RepRecipe->getRegion() ||
-          !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
-          all_of(RepRecipe->operands(),
-                 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
-         "Expected the recipe to either be within a region or to have all of "
-         "its operands defined outside the vectorized region.");
-}
-
-void VPReplicateRecipe::execute(VPTransformState &State) {
-  Instruction *UI = getUnderlyingInstr();
-
-  if (!State.Lane) {
-    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
-                             "must have already been unrolled");
-    scalarizeInstruction(UI, this, VPLane(0), State);
-    return;
-  }
-
-  assert((State.VF.isScalar() || !isSingleScalar()) &&
-         "uniform recipe shouldn't be predicated");
-  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-  scalarizeInstruction(UI, this, *State.Lane, State);
-  // Insert the scalar instance, packing it into a vector if needed.
-  if (State.VF.isVector() && shouldPack()) {
-    Value *WideValue =
-        State.Lane->isFirstLane()
-            ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
-            : State.get(this);
-    State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
-                                                        *State.Lane));
-  }
-}
-
-bool VPReplicateRecipe::shouldPack() const {
-  // Find out if the recipe is used by a widened recipe via an intervening
-  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a
-  // vector.
-  return any_of(users(), [](const VPUser *U) {
-    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
-      return !vputils::onlyScalarValuesUsed(PredR);
-    return false;
-  });
-}
-
-/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
-/// which the legacy cost model computes a SCEV expression when computing the
-/// address cost. Computing SCEVs for VPValues is incomplete and returns
-/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs.
In -/// those cases we fall back to the legacy cost model. Otherwise return nullptr. -static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE, - const Loop *L) { - auto *PtrR = Ptr->getDefiningRecipe(); - if (!PtrR || !((isa(Ptr) && - cast(Ptr)->getOpcode() == - Instruction::GetElementPtr) || - isa(Ptr) || - match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue())))) - return nullptr; - - // We are looking for a GEP where all indices are either loop invariant or - // inductions. - for (VPValue *Opd : drop_begin(PtrR->operands())) { - if (!Opd->isDefinedOutsideLoopRegions() && - !isa(Opd)) - return nullptr; - } - - return vputils::getSCEVExprForVPValue(Ptr, SE, L); -} - -/// Returns true if \p V is used as part of the address of another load or -/// store. -static bool isUsedByLoadStoreAddress(const VPUser *V) { - SmallPtrSet Seen; - SmallVector WorkList = {V}; - - while (!WorkList.empty()) { - auto *Cur = dyn_cast(WorkList.pop_back_val()); - if (!Cur || !Seen.insert(Cur).second) - continue; - - auto *Blend = dyn_cast(Cur); - // Skip blends that use V only through a compare by checking if any incoming - // value was already visited. - if (Blend && none_of(seq(0, Blend->getNumIncomingValues()), - [&](unsigned I) { - return Seen.contains( - Blend->getIncomingValue(I)->getDefiningRecipe()); - })) - continue; - - for (VPUser *U : Cur->users()) { - if (auto *InterleaveR = dyn_cast(U)) - if (InterleaveR->getAddr() == Cur) - return true; - if (auto *RepR = dyn_cast(U)) { - if (RepR->getOpcode() == Instruction::Load && - RepR->getOperand(0) == Cur) - return true; - if (RepR->getOpcode() == Instruction::Store && - RepR->getOperand(1) == Cur) - return true; - } - if (auto *MemR = dyn_cast(U)) { - if (MemR->getAddr() == Cur && MemR->isConsecutive()) - return true; - } - } - - // The legacy cost model only supports scalarization loads/stores with phi - // addresses, if the phi is directly used as load/store address. Don't - // traverse further for Blends. - if (Blend) - continue; - - append_range(WorkList, Cur->users()); - } - return false; -} - -InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - Instruction *UI = cast(getUnderlyingValue()); - // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan - // transform, avoid computing their cost multiple times for now. - Ctx.SkipCostComputation.insert(UI); - - if (VF.isScalable() && !isSingleScalar()) - return InstructionCost::getInvalid(); - - switch (UI->getOpcode()) { - case Instruction::GetElementPtr: - // We mark this instruction as zero-cost because the cost of GEPs in - // vectorized code depends on whether the corresponding memory instruction - // is scalarized or not. Therefore, we handle GEPs with the memory - // instruction cost. - return 0; - case Instruction::Call: { - auto *CalledFn = - cast(getOperand(getNumOperands() - 1)->getLiveInIRValue()); - - SmallVector ArgOps(drop_end(operands())); - SmallVector Tys; - for (const VPValue *ArgOp : ArgOps) - Tys.push_back(Ctx.Types.inferScalarType(ArgOp)); - - if (CalledFn->isIntrinsic()) - // Various pseudo-intrinsics with costs of 0 are scalarized instead of - // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early. 
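- // Editorial note: e.g. 'call void @llvm.assume(i1 %c)' remains a scalar
- // call and is modelled as free; the switch below whitelists these
- // zero-cost pseudo-intrinsics.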
- switch (CalledFn->getIntrinsicID()) { - case Intrinsic::assume: - case Intrinsic::lifetime_end: - case Intrinsic::lifetime_start: - case Intrinsic::sideeffect: - case Intrinsic::pseudoprobe: - case Intrinsic::experimental_noalias_scope_decl: { - assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, - ElementCount::getFixed(1), Ctx) == 0 && - "scalarizing intrinsic should be free"); - return InstructionCost(0); - } - default: - break; - } - - Type *ResultTy = Ctx.Types.inferScalarType(this); - InstructionCost ScalarCallCost = - Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); - if (isSingleScalar()) { - if (CalledFn->isIntrinsic()) - ScalarCallCost = std::min( - ScalarCallCost, - getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, - ElementCount::getFixed(1), Ctx)); - return ScalarCallCost; - } - - return ScalarCallCost * VF.getFixedValue() + - Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF); - } - case Instruction::Add: - case Instruction::Sub: - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::ICmp: - case Instruction::FCmp: - return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), - Ctx) * - (isSingleScalar() ? 1 : VF.getFixedValue()); - case Instruction::SDiv: - case Instruction::UDiv: - case Instruction::SRem: - case Instruction::URem: { - InstructionCost ScalarCost = - getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx); - if (isSingleScalar()) - return ScalarCost; - - ScalarCost = ScalarCost * VF.getFixedValue() + - Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this), - to_vector(operands()), VF); - // If the recipe is not predicated (i.e. not in a replicate region), return - // the scalar cost. Otherwise handle predicated cost. - if (!getRegion()->isReplicator()) - return ScalarCost; - - // Account for the phi nodes that we will create. - ScalarCost += VF.getFixedValue() * - Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); - // Scale the cost by the probability of executing the predicated blocks. - // This assumes the predicated block for each vector lane is equally - // likely. - ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent()); - return ScalarCost; - } - case Instruction::Load: - case Instruction::Store: { - // TODO: See getMemInstScalarizationCost for how to handle replicating and - // predicated cases. - const VPRegionBlock *ParentRegion = getRegion(); - if (ParentRegion && ParentRegion->isReplicator()) - break; - - bool IsLoad = UI->getOpcode() == Instruction::Load; - const VPValue *PtrOp = getOperand(!IsLoad); - const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L); - if (isa_and_nonnull(PtrSCEV)) - break; - - Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); - Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp); - const Align Alignment = getLoadStoreAlignment(UI); - unsigned AS = cast(ScalarPtrTy)->getAddressSpace(); - TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); - InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( - UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo); - - Type *PtrTy = isSingleScalar() ? 
ScalarPtrTy : toVectorTy(ScalarPtrTy, VF); - bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); - bool UsedByLoadStoreAddress = - !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this); - InstructionCost ScalarCost = - ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( - PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, - PtrSCEV, Ctx.CostKind); - if (isSingleScalar()) - return ScalarCost; - - SmallVector OpsToScalarize; - Type *ResultTy = Type::getVoidTy(PtrTy->getContext()); - // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we - // don't assign scalarization overhead in general, if the target prefers - // vectorized addressing or the loaded value is used as part of an address - // of another load or store. - if (!UsedByLoadStoreAddress) { - bool EfficientVectorLoadStore = - Ctx.TTI.supportsEfficientVectorElementLoadStore(); - if (!(IsLoad && !PreferVectorizedAddressing) && - !(!IsLoad && EfficientVectorLoadStore)) - append_range(OpsToScalarize, operands()); - - if (!EfficientVectorLoadStore) - ResultTy = Ctx.Types.inferScalarType(this); - } - - return (ScalarCost * VF.getFixedValue()) + - Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true); - } - } - - return Ctx.getLegacyCost(UI, VF); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE "); - - if (!getUnderlyingInstr()->getType()->isVoidTy()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - if (auto *CB = dyn_cast(getUnderlyingInstr())) { - O << "call"; - printFlags(O); - O << "@" << CB->getCalledFunction()->getName() << "("; - interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), - O, [&O, &SlotTracker](VPValue *Op) { - Op->printAsOperand(O, SlotTracker); - }); - O << ")"; - } else { - O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); - printFlags(O); - printOperands(O, SlotTracker); - } - - if (shouldPack()) - O << " (S->V)"; -} -#endif - -void VPBranchOnMaskRecipe::execute(VPTransformState &State) { - assert(State.Lane && "Branch on Mask works only on single instance."); - - VPValue *BlockInMask = getOperand(0); - Value *ConditionBit = State.get(BlockInMask, *State.Lane); - - // Replace the temporary unreachable terminator with a new conditional branch, - // whose two destinations will be set later when they are created. - auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); - assert(isa(CurrentTerminator) && - "Expected to replace unreachable terminator with conditional branch."); - auto CondBr = - State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr); - CondBr->setSuccessor(0, nullptr); - CurrentTerminator->eraseFromParent(); -} - -InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - // The legacy cost model doesn't assign costs to branches for individual - // replicate regions. Match the current behavior in the VPlan cost model for - // now. 
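- // Editorial note: i.e. the per-lane conditional branch created by
- // VPBranchOnMaskRecipe::execute above is deliberately costed as free.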
-  return 0;
-}
-
-void VPPredInstPHIRecipe::execute(VPTransformState &State) {
-  assert(State.Lane && "Predicated instruction PHI works per instance.");
-  Instruction *ScalarPredInst =
-      cast<Instruction>(State.get(getOperand(0), *State.Lane));
-  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
-  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
-  assert(PredicatingBB && "Predicated block has no single predecessor.");
-  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
-         "operand must be VPReplicateRecipe");
-
-  // By current pack/unpack logic we need to generate only a single phi node: if
-  // a vector value for the predicated instruction exists at this point it means
-  // the instruction has vector users only, and a phi for the vector value is
-  // needed. In this case the recipe of the predicated instruction is marked to
-  // also do that packing, thereby "hoisting" the insert-element sequence.
-  // Otherwise, a phi node for the scalar value is needed.
-  if (State.hasVectorValue(getOperand(0))) {
-    auto *VecI = cast<Instruction>(State.get(getOperand(0)));
-    assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
-           "Packed operands must generate an insertelement or insertvalue");
-
-    // If VecI is a struct, it will be a sequence like:
-    //   %1 = insertvalue %unmodified, %x, 0
-    //   %2 = insertvalue %1, %y, 1
-    //   %VecI = insertvalue %2, %z, 2
-    // To get the unmodified vector we need to look through the chain.
-    if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
-      for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
-        VecI = cast<Instruction>(VecI->getOperand(0));
-
-    PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
-    VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
-    VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
-    if (State.hasVectorValue(this))
-      State.reset(this, VPhi);
-    else
-      State.set(this, VPhi);
-    // NOTE: Currently we need to update the value of the operand, so the next
-    // predicated iteration inserts its generated value in the correct vector.
-    State.reset(getOperand(0), VPhi);
-  } else {
-    if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
-      return;
-
-    Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
-    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
-    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
-                     PredicatingBB);
-    Phi->addIncoming(ScalarPredInst, PredicatedBB);
-    if (State.hasScalarValue(this, *State.Lane))
-      State.reset(this, Phi, *State.Lane);
-    else
-      State.set(this, Phi, *State.Lane);
-    // NOTE: Currently we need to update the value of the operand, so the next
-    // predicated iteration inserts its generated value in the correct vector.
-    State.reset(getOperand(0), Phi, *State.Lane);
-  }
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                      VPSlotTracker &SlotTracker) const {
-  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
-  printAsOperand(O, SlotTracker);
-  O << " = ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
-                                                 VPCostContext &Ctx) const {
-  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
-                    ->getAddressSpace();
-  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
-                        ? Instruction::Load
-                        : Instruction::Store;
-
-  if (!Consecutive) {
-    // TODO: Using the original IR may not be accurate.
-    // Currently, ARM will use the underlying IR to calculate gather/scatter
-    // instruction cost.
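-    // E.g. a non-consecutive widened load is costed as a gather like the
-    // following (illustrative, VF=4):
-    //   %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs,
-    //            i32 4, <4 x i1> %mask, <4 x i32> poison)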
-    assert(!Reverse &&
-           "Inconsecutive memory access should not be reversed.");
-
-    const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
-    Type *PtrTy = Ptr->getType();
-
-    // If the address value is uniform across all lanes, then the address can
-    // be calculated with scalar type and broadcast.
-    if (!vputils::isSingleScalar(getAddr()))
-      PtrTy = toVectorTy(PtrTy, VF);
-
-    unsigned IID = isa<VPWidenLoadRecipe>(this)      ? Intrinsic::masked_gather
-                   : isa<VPWidenStoreRecipe>(this)   ? Intrinsic::masked_scatter
-                   : isa<VPWidenLoadEVLRecipe>(this) ? Intrinsic::vp_gather
-                                                     : Intrinsic::vp_scatter;
-    return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
-                                             Ctx.CostKind) +
-           Ctx.TTI.getMemIntrinsicInstrCost(
-               MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
-                                          &Ingredient),
-               Ctx.CostKind);
-  }
-
-  InstructionCost Cost = 0;
-  if (IsMasked) {
-    unsigned IID = isa<VPWidenLoadRecipe>(this) ? Intrinsic::masked_load
-                                                : Intrinsic::masked_store;
-    Cost += Ctx.TTI.getMemIntrinsicInstrCost(
-        MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
-  } else {
-    TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
-        isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
-                                                           : getOperand(1));
-    Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
-                                    OpInfo, &Ingredient);
-  }
-  if (!Reverse)
-    return Cost;
-
-  return Cost + Ctx.TTI.getShuffleCost(
-                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
-                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
-}
-
-void VPWidenLoadRecipe::execute(VPTransformState &State) {
-  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
-  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-  bool CreateGather = !isConsecutive();
-
-  auto &Builder = State.Builder;
-  Value *Mask = nullptr;
-  if (auto *VPMask = getMask()) {
-    // Mask reversal is only needed for non-all-one (null) masks, as reverse
-    // of a null all-one mask is a null mask.
-    Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = Builder.CreateVectorReverse(Mask, "reverse");
-  }
-
-  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
-  Value *NewLI;
-  if (CreateGather) {
-    NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
-                                       "wide.masked.gather");
-  } else if (Mask) {
-    NewLI =
-        Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
-                                 PoisonValue::get(DataTy), "wide.masked.load");
-  } else {
-    NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
-  }
-  applyMetadata(*cast<Instruction>(NewLI));
-  if (Reverse)
-    NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
-  State.set(this, NewLI);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                    VPSlotTracker &SlotTracker) const {
-  O << Indent << "WIDEN ";
-  printAsOperand(O, SlotTracker);
-  O << " = load ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
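-/// E.g. with EVL = 3 and operand <8 x i32> <a,b,c,d,e,f,g,h>, this emits
-/// (illustrative):
-///   %r = call <8 x i32> @llvm.experimental.vp.reverse.v8i32(
-///            <8 x i32> %op, <8 x i1> splat (i1 true), i32 3)
-/// which yields <c,b,a,...> with the lanes past EVL unspecified.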
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
-                                     Value *EVL, const Twine &Name) {
-  VectorType *ValTy = cast<VectorType>(Operand->getType());
-  Value *AllTrueMask =
-      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
-  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
-                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
-void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
-  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
-  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-  bool CreateGather = !isConsecutive();
-
-  auto &Builder = State.Builder;
-  CallInst *NewLI;
-  Value *EVL = State.get(getEVL(), VPLane(0));
-  Value *Addr = State.get(getAddr(), !CreateGather);
-  Value *Mask = nullptr;
-  if (VPValue *VPMask = getMask()) {
-    Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
-  } else {
-    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-  }
-
-  if (CreateGather) {
-    NewLI =
-        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
-                                nullptr, "wide.masked.gather");
-  } else {
-    NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
-                                    {Addr, Mask, EVL}, nullptr, "vp.op.load");
-  }
-  NewLI->addParamAttr(
-      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
-  applyMetadata(*NewLI);
-  Instruction *Res = NewLI;
-  if (isReverse())
-    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
-  State.set(this, Res);
-}
-
-InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
-                                                  VPCostContext &Ctx) const {
-  if (!Consecutive || IsMasked)
-    return VPWidenMemoryRecipe::computeCost(VF, Ctx);
-
-  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
-  // here because the EVL recipes use EVL to replace the tail mask, while the
-  // legacy model always accounts for the cost of the mask.
-  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
-  // no longer need to compare against the legacy cost model.
-  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
-                    ->getAddressSpace();
-  InstructionCost Cost = Ctx.TTI.getMemIntrinsicInstrCost(
-      MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
-      Ctx.CostKind);
-  if (!Reverse)
-    return Cost;
-
-  return Cost + Ctx.TTI.getShuffleCost(
-                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
-                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                       VPSlotTracker &SlotTracker) const {
-  O << Indent << "WIDEN ";
-  printAsOperand(O, SlotTracker);
-  O << " = vp.load ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-void VPWidenStoreRecipe::execute(VPTransformState &State) {
-  VPValue *StoredVPValue = getStoredValue();
-  bool CreateScatter = !isConsecutive();
-
-  auto &Builder = State.Builder;
-
-  Value *Mask = nullptr;
-  if (auto *VPMask = getMask()) {
-    // Mask reversal is only needed for non-all-one (null) masks, as reverse
-    // of a null all-one mask is a null mask.
-    Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = Builder.CreateVectorReverse(Mask, "reverse");
-  }
-
-  Value *StoredVal = State.get(StoredVPValue);
-  if (isReverse()) {
-    // If we store to reverse consecutive memory locations, then we need
-    // to reverse the order of elements in the stored value.
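-    // E.g. a reverse store of <a,b,c,d> first emits (illustrative):
-    //   %rev = shufflevector <4 x i32> %val, <4 x i32> poison,
-    //                        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-    // so that a single consecutive wide store can still be used.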
-    StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
-    // We don't want to update the value in the map as it might be used in
-    // another expression. So don't call resetVectorValue(StoredVal).
-  }
-  Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
-  Instruction *NewSI = nullptr;
-  if (CreateScatter)
-    NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
-  else if (Mask)
-    NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
-  else
-    NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
-  applyMetadata(*NewSI);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                     VPSlotTracker &SlotTracker) const {
-  O << Indent << "WIDEN store ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
-  VPValue *StoredValue = getStoredValue();
-  bool CreateScatter = !isConsecutive();
-
-  auto &Builder = State.Builder;
-
-  CallInst *NewSI = nullptr;
-  Value *StoredVal = State.get(StoredValue);
-  Value *EVL = State.get(getEVL(), VPLane(0));
-  if (isReverse())
-    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
-  Value *Mask = nullptr;
-  if (VPValue *VPMask = getMask()) {
-    Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
-  } else {
-    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-  }
-  Value *Addr = State.get(getAddr(), !CreateScatter);
-  if (CreateScatter) {
-    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
-                                    Intrinsic::vp_scatter,
-                                    {StoredVal, Addr, Mask, EVL});
-  } else {
-    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
-                                    Intrinsic::vp_store,
-                                    {StoredVal, Addr, Mask, EVL});
-  }
-  NewSI->addParamAttr(
-      1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
-  applyMetadata(*NewSI);
-}
-
-InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
-                                                   VPCostContext &Ctx) const {
-  if (!Consecutive || IsMasked)
-    return VPWidenMemoryRecipe::computeCost(VF, Ctx);
-
-  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
-  // here because the EVL recipes use EVL to replace the tail mask, while the
-  // legacy model always accounts for the cost of the mask.
-  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
-  // no longer need to compare against the legacy cost model.
-  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
-                    ->getAddressSpace();
-  InstructionCost Cost = Ctx.TTI.getMemIntrinsicInstrCost(
-      MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
-      Ctx.CostKind);
-  if (!Reverse)
-    return Cost;
-
-  return Cost + Ctx.TTI.getShuffleCost(
-                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
-                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                        VPSlotTracker &SlotTracker) const {
-  O << Indent << "WIDEN vp.store ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
-                                     VectorType *DstVTy, const DataLayout &DL) {
-  // Verify that V is a vector type with same number of elements as DstVTy.
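-  // E.g. on a target with 64-bit pointers, <4 x double> -> <4 x ptr> is not
-  // directly castable; it is handled below as the two-step
-  // <4 x double> -> <4 x i64> -> <4 x ptr> (illustrative).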
-  auto VF = DstVTy->getElementCount();
-  auto *SrcVecTy = cast<VectorType>(V->getType());
-  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
-  Type *SrcElemTy = SrcVecTy->getElementType();
-  Type *DstElemTy = DstVTy->getElementType();
-  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
-         "Vector elements must have same size");
-
-  // Do a direct cast if element types are castable.
-  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
-    return Builder.CreateBitOrPointerCast(V, DstVTy);
-  }
-  // V cannot be cast directly to the desired vector type. This may happen when
-  // V is a floating point vector but DstVTy is a vector of pointers or
-  // vice-versa. Handle this with a two-step bitcast through an intermediate
-  // integer vector type, i.e. Ptr <-> Int <-> Float.
-  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
-         "Only one type should be a pointer type");
-  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
-         "Only one type should be a floating point type");
-  Type *IntTy =
-      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
-  auto *VecIntTy = VectorType::get(IntTy, VF);
-  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
-  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
-}
-
-/// Return a vector containing interleaved elements from multiple
-/// smaller input vectors.
-static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
-                                const Twine &Name) {
-  unsigned Factor = Vals.size();
-  assert(Factor > 1 && "Tried to interleave invalid number of vectors");
-
-  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
-#ifndef NDEBUG
-  for (Value *Val : Vals)
-    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
-#endif
-
-  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
-  // must use intrinsics to interleave.
-  if (VecTy->isScalableTy()) {
-    assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
-    return Builder.CreateVectorInterleave(Vals, Name);
-  }
-
-  // Fixed length. Start by concatenating all vectors into a wide vector.
-  Value *WideVec = concatenateVectors(Builder, Vals);
-
-  // Interleave the elements into the wide vector.
-  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
-  return Builder.CreateShuffleVector(
-      WideVec, createInterleaveMask(NumElts, Factor), Name);
-}
-
-// Try to vectorize the interleave group that \p Instr belongs to.
-//
-// E.g. Translate following interleaved load group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-//   R = Pic[i];   // Member of index 0
-//   G = Pic[i+1]; // Member of index 1
-//   B = Pic[i+2]; // Member of index 2
-//   ...           // do something to R, G, B
-// }
-// To:
-// %wide.vec = load <12 x i32>                        ; Read 4 tuples of R,G,B
-// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
-// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
-// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
-//
-// Or translate following interleaved store group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-//   ... do something to R, G, B
-//   Pic[i]   = R; // Member of index 0
-//   Pic[i+1] = G; // Member of index 1
-//   Pic[i+2] = B; // Member of index 2
-// }
-// To:
-// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
-// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
-// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
-//                    <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
-//                                        ; Interleave R,G,B elements
-// store <12 x i32> %interleaved.vec      ; Write 4 tuples of R,G,B
-void VPInterleaveRecipe::execute(VPTransformState &State) {
-  assert(!State.Lane && "Interleave group being replicated.");
-  assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
-         "Masking gaps for scalable vectors is not yet supported.");
-  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
-  Instruction *Instr = Group->getInsertPos();
-
-  // Prepare for the vector type of the interleaved load/store.
-  Type *ScalarTy = getLoadStoreType(Instr);
-  unsigned InterleaveFactor = Group->getFactor();
-  auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
-
-  VPValue *BlockInMask = getMask();
-  VPValue *Addr = getAddr();
-  Value *ResAddr = State.get(Addr, VPLane(0));
-
-  auto CreateGroupMask = [&BlockInMask, &State,
-                          &InterleaveFactor](Value *MaskForGaps) -> Value * {
-    if (State.VF.isScalable()) {
-      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(InterleaveFactor <= 8 &&
-             "Unsupported deinterleave factor for scalable vectors");
-      auto *ResBlockInMask = State.get(BlockInMask);
-      SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
-      return interleaveVectors(State.Builder, Ops, "interleaved.mask");
-    }
-
-    if (!BlockInMask)
-      return MaskForGaps;
-
-    Value *ResBlockInMask = State.get(BlockInMask);
-    Value *ShuffledMask = State.Builder.CreateShuffleVector(
-        ResBlockInMask,
-        createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
-        "interleaved.mask");
-    return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
-                                                   ShuffledMask, MaskForGaps)
-                       : ShuffledMask;
-  };
-
-  const DataLayout &DL = Instr->getDataLayout();
-  // Vectorize the interleaved load group.
-  if (isa<LoadInst>(Instr)) {
-    Value *MaskForGaps = nullptr;
-    if (needsMaskForGaps()) {
-      MaskForGaps =
-          createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
-      assert(MaskForGaps && "Mask for Gaps is required but it is null");
-    }
-
-    Instruction *NewLoad;
-    if (BlockInMask || MaskForGaps) {
-      Value *GroupMask = CreateGroupMask(MaskForGaps);
-      Value *PoisonVec = PoisonValue::get(VecTy);
-      NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
-                                               Group->getAlign(), GroupMask,
-                                               PoisonVec, "wide.masked.vec");
-    } else
-      NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
-                                                Group->getAlign(), "wide.vec");
-    applyMetadata(*NewLoad);
-    // TODO: Also manage existing metadata using VPIRMetadata.
-    Group->addMetadata(NewLoad);
-
-    ArrayRef<VPValue *> VPDefs = definedValues();
-    if (VecTy->isScalableTy()) {
-      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
-      // so must use intrinsics to deinterleave.
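-      // E.g. for factor 2 this emits (illustrative):
-      //   %sv = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
-      //       @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)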
-      assert(InterleaveFactor <= 8 &&
-             "Unsupported deinterleave factor for scalable vectors");
-      NewLoad = State.Builder.CreateIntrinsic(
-          Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
-          NewLoad->getType(), NewLoad,
-          /*FMFSource=*/nullptr, "strided.vec");
-    }
-
-    auto CreateStridedVector = [&InterleaveFactor, &State,
-                                &NewLoad](unsigned Index) -> Value * {
-      assert(Index < InterleaveFactor && "Illegal group index");
-      if (State.VF.isScalable())
-        return State.Builder.CreateExtractValue(NewLoad, Index);
-
-      // For fixed length VF, use shuffle to extract the sub-vectors from the
-      // wide load.
-      auto StrideMask =
-          createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
-      return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
-                                               "strided.vec");
-    };
-
-    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
-      Instruction *Member = Group->getMember(I);
-
-      // Skip the gaps in the group.
-      if (!Member)
-        continue;
-
-      Value *StridedVec = CreateStridedVector(I);
-
-      // If this member has different type, cast the result type.
-      if (Member->getType() != ScalarTy) {
-        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
-        StridedVec =
-            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
-      }
-
-      if (Group->isReverse())
-        StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
-
-      State.set(VPDefs[J], StridedVec);
-      ++J;
-    }
-    return;
-  }
-
-  // The sub vector type for current instruction.
-  auto *SubVT = VectorType::get(ScalarTy, State.VF);
-
-  // Vectorize the interleaved store group.
-  Value *MaskForGaps =
-      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
-  assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
-         "Mismatch between NeedsMaskForGaps and MaskForGaps");
-  ArrayRef<VPValue *> StoredValues = getStoredValues();
-  // Collect the stored vector from each member.
-  SmallVector<Value *> StoredVecs;
-  unsigned StoredIdx = 0;
-  for (unsigned i = 0; i < InterleaveFactor; i++) {
-    assert((Group->getMember(i) || MaskForGaps) &&
-           "Fail to get a member from an interleaved store group");
-    Instruction *Member = Group->getMember(i);
-
-    // Skip the gaps in the group.
-    if (!Member) {
-      Value *Undef = PoisonValue::get(SubVT);
-      StoredVecs.push_back(Undef);
-      continue;
-    }
-
-    Value *StoredVec = State.get(StoredValues[StoredIdx]);
-    ++StoredIdx;
-
-    if (Group->isReverse())
-      StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
-
-    // If this member has different type, cast it to a unified type.
-    if (StoredVec->getType() != SubVT)
-      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
-
-    StoredVecs.push_back(StoredVec);
-  }
-
-  // Interleave all the smaller vectors into one wider vector.
-  Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
-  Instruction *NewStoreInstr;
-  if (BlockInMask || MaskForGaps) {
-    Value *GroupMask = CreateGroupMask(MaskForGaps);
-    NewStoreInstr = State.Builder.CreateMaskedStore(
-        IVec, ResAddr, Group->getAlign(), GroupMask);
-  } else
-    NewStoreInstr =
-        State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
-
-  applyMetadata(*NewStoreInstr);
-  // TODO: Also manage existing metadata using VPIRMetadata.
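-  // E.g. if all members of the group carry common !tbaa metadata, the single
-  // wide store keeps it (illustrative):
-  //   store <12 x i32> %interleaved.vec, ptr %addr, align 4, !tbaa !5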
-  Group->addMetadata(NewStoreInstr);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                     VPSlotTracker &SlotTracker) const {
-  const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
-  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
-  IG->getInsertPos()->printAsOperand(O, false);
-  O << ", ";
-  getAddr()->printAsOperand(O, SlotTracker);
-  VPValue *Mask = getMask();
-  if (Mask) {
-    O << ", ";
-    Mask->printAsOperand(O, SlotTracker);
-  }
-
-  unsigned OpIdx = 0;
-  for (unsigned i = 0; i < IG->getFactor(); ++i) {
-    if (!IG->getMember(i))
-      continue;
-    if (getNumStoreOperands() > 0) {
-      O << "\n" << Indent << "  store ";
-      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
-      O << " to index " << i;
-    } else {
-      O << "\n" << Indent << "  ";
-      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
-      O << " = load from index " << i;
-    }
-    ++OpIdx;
-  }
-}
-#endif
-
-void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
-  assert(!State.Lane && "Interleave group being replicated.");
-  assert(State.VF.isScalable() &&
-         "Only support scalable VF for EVL tail-folding.");
-  assert(!needsMaskForGaps() &&
-         "Masking gaps for scalable vectors is not yet supported.");
-  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
-  Instruction *Instr = Group->getInsertPos();
-
-  // Prepare for the vector type of the interleaved load/store.
-  Type *ScalarTy = getLoadStoreType(Instr);
-  unsigned InterleaveFactor = Group->getFactor();
-  assert(InterleaveFactor <= 8 &&
-         "Unsupported deinterleave/interleave factor for scalable vectors");
-  ElementCount WideVF = State.VF * InterleaveFactor;
-  auto *VecTy = VectorType::get(ScalarTy, WideVF);
-
-  VPValue *Addr = getAddr();
-  Value *ResAddr = State.get(Addr, VPLane(0));
-  Value *EVL = State.get(getEVL(), VPLane(0));
-  Value *InterleaveEVL = State.Builder.CreateMul(
-      EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
-      /*NUW=*/true, /*NSW=*/true);
-  LLVMContext &Ctx = State.Builder.getContext();
-
-  Value *GroupMask = nullptr;
-  if (VPValue *BlockInMask = getMask()) {
-    SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
-    GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
-  } else {
-    GroupMask =
-        State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
-  }
-
-  // Vectorize the interleaved load group.
-  if (isa<LoadInst>(Instr)) {
-    CallInst *NewLoad = State.Builder.CreateIntrinsic(
-        VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
-        "wide.vp.load");
-    NewLoad->addParamAttr(0,
-                          Attribute::getWithAlignment(Ctx, Group->getAlign()));
-
-    applyMetadata(*NewLoad);
-    // TODO: Also manage existing metadata using VPIRMetadata.
-    Group->addMetadata(NewLoad);
-
-    // Scalable vectors cannot use arbitrary shufflevectors (only splats),
-    // so must use intrinsics to deinterleave.
-    NewLoad = State.Builder.CreateIntrinsic(
-        Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
-        NewLoad->getType(), NewLoad,
-        /*FMFSource=*/nullptr, "strided.vec");
-
-    const DataLayout &DL = Instr->getDataLayout();
-    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
-      Instruction *Member = Group->getMember(I);
-      // Skip the gaps in the group.
-      if (!Member)
-        continue;
-
-      Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
-      // If this member has different type, cast the result type.
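-      // E.g. a group mixing i64 and double members extracts the sub-vector as
-      // <vscale x 4 x i64> and bitcasts it to <vscale x 4 x double> here
-      // (illustrative).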
-      if (Member->getType() != ScalarTy) {
-        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
-        StridedVec =
-            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
-      }
-
-      State.set(getVPValue(J), StridedVec);
-      ++J;
-    }
-    return;
-  } // End of the interleaved load group.
-
-  // The sub vector type for current instruction.
-  auto *SubVT = VectorType::get(ScalarTy, State.VF);
-  // Vectorize the interleaved store group.
-  ArrayRef<VPValue *> StoredValues = getStoredValues();
-  // Collect the stored vector from each member.
-  SmallVector<Value *> StoredVecs;
-  const DataLayout &DL = Instr->getDataLayout();
-  for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
-    Instruction *Member = Group->getMember(I);
-    // Skip the gaps in the group.
-    if (!Member) {
-      StoredVecs.push_back(PoisonValue::get(SubVT));
-      continue;
-    }
-
-    Value *StoredVec = State.get(StoredValues[StoredIdx]);
-    // If this member has different type, cast it to a unified type.
-    if (StoredVec->getType() != SubVT)
-      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
-
-    StoredVecs.push_back(StoredVec);
-    ++StoredIdx;
-  }
-
-  // Interleave all the smaller vectors into one wider vector.
-  Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
-  CallInst *NewStore =
-      State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
-                                    {IVec, ResAddr, GroupMask, InterleaveEVL});
-  NewStore->addParamAttr(1,
-                         Attribute::getWithAlignment(Ctx, Group->getAlign()));
-
-  applyMetadata(*NewStore);
-  // TODO: Also manage existing metadata using VPIRMetadata.
-  Group->addMetadata(NewStore);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                        VPSlotTracker &SlotTracker) const {
-  const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
-  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
-  IG->getInsertPos()->printAsOperand(O, false);
-  O << ", ";
-  getAddr()->printAsOperand(O, SlotTracker);
-  O << ", ";
-  getEVL()->printAsOperand(O, SlotTracker);
-  if (VPValue *Mask = getMask()) {
-    O << ", ";
-    Mask->printAsOperand(O, SlotTracker);
-  }
-
-  unsigned OpIdx = 0;
-  for (unsigned i = 0; i < IG->getFactor(); ++i) {
-    if (!IG->getMember(i))
-      continue;
-    if (getNumStoreOperands() > 0) {
-      O << "\n" << Indent << "  vp.store ";
-      getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
-      O << " to index " << i;
-    } else {
-      O << "\n" << Indent << "  ";
-      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
-      O << " = vp.load from index " << i;
-    }
-    ++OpIdx;
-  }
-}
-#endif
-
-InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
-                                              VPCostContext &Ctx) const {
-  Instruction *InsertPos = getInsertPos();
-  // Find the VPValue index of the interleave group. We need to skip gaps.
-  unsigned InsertPosIdx = 0;
-  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
-    if (auto *Member = IG->getMember(Idx)) {
-      if (Member == InsertPos)
-        break;
-      InsertPosIdx++;
-    }
-  Type *ValTy = Ctx.Types.inferScalarType(
-      getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
-                                : getStoredValues()[InsertPosIdx]);
-  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
-  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
-                    ->getAddressSpace();
-
-  unsigned InterleaveFactor = IG->getFactor();
-  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
-
-  // Holds the indices of existing members in the interleaved group.
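-  // E.g. a factor-3 group with a gap at member 1 yields Indices = {0, 2}.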
-  SmallVector<unsigned, 4> Indices;
-  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
-    if (IG->getMember(IF))
-      Indices.push_back(IF);
-
-  // Calculate the cost of the whole interleaved group.
-  InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
-      InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
-      IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
-
-  if (!IG->isReverse())
-    return Cost;
-
-  return Cost + IG->getNumMembers() *
-                    Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                           VectorTy, VectorTy, {}, Ctx.CostKind,
-                                           0);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                         VPSlotTracker &SlotTracker) const {
-  O << Indent << "EMIT ";
-  printAsOperand(O, SlotTracker);
-  O << " = CANONICAL-INDUCTION ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
-  return vputils::onlyScalarValuesUsed(this) &&
-         (!IsScalable || vputils::onlyFirstLaneUsed(this));
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenPointerInductionRecipe::printRecipe(
-    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
-  assert((getNumOperands() == 3 || getNumOperands() == 5) &&
-         "unexpected number of operands");
-  O << Indent << "EMIT ";
-  printAsOperand(O, SlotTracker);
-  O << " = WIDEN-POINTER-INDUCTION ";
-  getStartValue()->printAsOperand(O, SlotTracker);
-  O << ", ";
-  getStepValue()->printAsOperand(O, SlotTracker);
-  O << ", ";
-  getOperand(2)->printAsOperand(O, SlotTracker);
-  if (getNumOperands() == 5) {
-    O << ", ";
-    getOperand(3)->printAsOperand(O, SlotTracker);
-    O << ", ";
-    getOperand(4)->printAsOperand(O, SlotTracker);
-  }
-}
-
-void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                     VPSlotTracker &SlotTracker) const {
-  O << Indent << "EMIT ";
-  printAsOperand(O, SlotTracker);
-  O << " = EXPAND SCEV " << *Expr;
-}
-#endif
-
-void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
-  Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
-  Type *STy = CanonicalIV->getType();
-  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
-  ElementCount VF = State.VF;
-  Value *VStart = VF.isScalar()
-                      ? CanonicalIV
-                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
-  Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
-  if (VF.isVector()) {
-    VStep = Builder.CreateVectorSplat(VF, VStep);
-    VStep =
-        Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
-  }
-  Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
-  State.set(this, CanonicalVectorIV);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
-                                           VPSlotTracker &SlotTracker) const {
-  O << Indent << "EMIT ";
-  printAsOperand(O, SlotTracker);
-  O << " = WIDEN-CANONICAL-INDUCTION ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
-  auto &Builder = State.Builder;
-  // Create a vector from the initial value.
-  auto *VectorInit = getStartValue()->getLiveInIRValue();
-
-  Type *VecTy = State.VF.isScalar()
-                    ? VectorInit->getType()
-                    : VectorType::get(VectorInit->getType(), State.VF);
-
-  BasicBlock *VectorPH =
-      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
-  if (State.VF.isVector()) {
-    auto *IdxTy = Builder.getInt32Ty();
-    auto *One = ConstantInt::get(IdxTy, 1);
-    IRBuilder<>::InsertPointGuard Guard(Builder);
-    Builder.SetInsertPoint(VectorPH->getTerminator());
-    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
-    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
-    VectorInit = Builder.CreateInsertElement(
-        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
-  }
-
-  // Create a phi node for the new recurrence.
-  PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
-  Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
-  Phi->addIncoming(VectorInit, VectorPH);
-  State.set(this, Phi);
-}
-
-InstructionCost
-VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
-                                             VPCostContext &Ctx) const {
-  if (VF.isScalar())
-    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
-
-  return 0;
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPFirstOrderRecurrencePHIRecipe::printRecipe(
-    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
-  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
-  printAsOperand(O, SlotTracker);
-  O << " = phi ";
-  printOperands(O, SlotTracker);
-}
-#endif
-
-void VPReductionPHIRecipe::execute(VPTransformState &State) {
-  // Reductions do not have to start at zero. They can start with
-  // any loop invariant values.
-  VPValue *StartVPV = getStartValue();
-
-  // In order to support recurrences we need to be able to vectorize Phi nodes.
-  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
-  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
-  // this value when we vectorize all of the instructions that use the PHI.
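-  // E.g. for a widened i32 add reduction with VF=4, the phi created here is
-  // roughly (illustrative):
-  //   %vec.phi = phi <4 x i32> [ %start, %vector.ph ], [ %next, %vector.body ]
-  // with the back-edge value only filled in during stage #2 (fixup).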
- BasicBlock *VectorPH = - State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); - bool ScalarPHI = State.VF.isScalar() || isInLoop(); - Value *StartV = State.get(StartVPV, ScalarPHI); - Type *VecTy = StartV->getType(); - - BasicBlock *HeaderBB = State.CFG.PrevBB; - assert(State.CurrentParentLoop->getHeader() == HeaderBB && - "recipe must be in the vector loop header"); - auto *Phi = PHINode::Create(VecTy, 2, "vec.phi"); - Phi->insertBefore(HeaderBB->getFirstInsertionPt()); - State.set(this, Phi, isInLoop()); - - Phi->addIncoming(StartV, VectorPH); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-REDUCTION-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); - if (getVFScaleFactor() > 1) - O << " (VF scaled by 1/" << getVFScaleFactor() << ")"; -} -#endif - -void VPWidenPHIRecipe::execute(VPTransformState &State) { - Value *Op0 = State.get(getOperand(0)); - Type *VecTy = Op0->getType(); - Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name); - State.set(this, VecPhi); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printPhiOperands(O, SlotTracker); -} -#endif - -void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { - BasicBlock *VectorPH = - State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); - Value *StartMask = State.get(getOperand(0)); - PHINode *Phi = - State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); - Phi->addIncoming(StartMask, VectorPH); - State.set(this, Phi); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "ACTIVE-LANE-MASK-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} -#endif - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPEVLBasedIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} -#endif