diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 5234ef8788d9e..2acd0117decfb 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1672,6 +1672,9 @@ class TargetTransformInfo { /// \return The maximum number of function arguments the target supports. unsigned getMaxNumArgs() const; + /// \returns Whether vector operations are a good candidate for vector widen. + bool considerToWiden(LLVMContext &Context, ArrayRef IL) const; + /// @} private: @@ -2041,6 +2044,8 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; + virtual bool considerToWiden(LLVMContext &Context, + ArrayRef IL) const = 0; }; template @@ -2757,6 +2762,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } + + bool considerToWiden(LLVMContext &Context, + ArrayRef IL) const override { + return Impl.considerToWiden(Context, IL); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index c1ff314ae51c9..5278c9d4dc8c3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -895,6 +895,10 @@ class TargetTransformInfoImplBase { unsigned getMaxNumArgs() const { return UINT_MAX; } + bool considerToWiden(LLVMContext &Context, ArrayRef IL) const { + return false; + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/include/llvm/Transforms/Vectorize/VectorWiden.h b/llvm/include/llvm/Transforms/Vectorize/VectorWiden.h new file mode 100644 index 0000000000000..6988785a92ce0 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/VectorWiden.h @@ -0,0 +1,25 @@ +//===--- VectorWiden.h - Combining Vector Operations to wider types ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VECTORWIDENING_H +#define LLVM_TRANSFORMS_VECTORIZE_VECTORWIDENING_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class VectorWidenPass : public PassInfoMixin { +public: + VectorWidenPass() {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VECTORWIDENING_H diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index aad14f21d1146..d01ed739a5a71 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1248,6 +1248,11 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +bool TargetTransformInfo::considerToWiden(LLVMContext &Context, + ArrayRef IL) const { + return TTIImpl->considerToWiden(Context, IL); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 985ff88139323..8ef983e5d8651 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -233,8 +233,8 @@ #include "llvm/Transforms/Utils/CanonicalizeAliases.h" #include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h" #include "llvm/Transforms/Utils/CountVisits.h" -#include "llvm/Transforms/Utils/Debugify.h" #include "llvm/Transforms/Utils/DXILUpgrade.h" +#include "llvm/Transforms/Utils/Debugify.h" #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" #include "llvm/Transforms/Utils/FixIrreducible.h" #include "llvm/Transforms/Utils/HelloWorld.h" @@ -263,6 +263,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorWiden.h" #include using namespace llvm; diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index df9f14920f291..2eef2f0a22d95 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -428,6 +428,7 @@ FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("typepromotion", TypePromotionPass(TM)) FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass()) FUNCTION_PASS("vector-combine", VectorCombinePass()) +FUNCTION_PASS("vector-widen", VectorWidenPass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index cded28054f592..cecf02ee250b0 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2426,6 +2426,14 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, CostKind, I)); } + static const TypeConversionCostTblEntry SME2Tbl[] = { + {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 1}}; + + if (ST->hasSME2()) + if (const auto *Entry = ConvertCostTableLookup( + SME2Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return AdjustCost(Entry->Cost); + if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, DstTy.getSimpleVT(), 
SrcTy.getSimpleVT())) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index a6baade412c77..9afba1ec17ab7 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -412,6 +412,26 @@ class AArch64TTIImpl : public BasicTTIImplBase { return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy); } + + bool considerToWiden(LLVMContext &Context, ArrayRef IL) const { + unsigned Opcode = IL[0]->getOpcode(); + Type *Ty = IL[0]->getType(); + if (!ST->hasSME2()) + return false; + if (llvm::any_of(IL, [Opcode, Ty](Instruction *I) { + return (Opcode != I->getOpcode() || Ty != I->getType()); + })) + return false; + if (Opcode == Instruction::FPTrunc && + Ty == ScalableVectorType::get(Type::getHalfTy(Context), 4)) + return true; + if (Opcode == Instruction::Add && + Ty == ScalableVectorType::get(Type::getInt32Ty(Context), 4) && + (IL[0]->getOperand(1) == IL[1]->getOperand(1) || + IL[0]->getOperand(0) == IL[1]->getOperand(0))) + return true; + return false; + } }; } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 998dfd956575d..a1537bb1ffa63 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_component_library(LLVMVectorize SLPVectorizer.cpp Vectorize.cpp VectorCombine.cpp + VectorWiden.cpp VPlan.cpp VPlanHCFGBuilder.cpp VPlanRecipes.cpp diff --git a/llvm/lib/Transforms/Vectorize/VectorWiden.cpp b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp new file mode 100644 index 0000000000000..2b7e7eaa77840 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp @@ -0,0 +1,429 @@ +//===--- VectorWiden.cpp - Combining Vector Operations to wider types ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass tries to widen vector operations to a wider type. It finds +// operations of a given vector type that are independent of each other, in the +// same bottom-up fashion as SLP does with scalars. It detects consecutive +// stores that can be combined into a single wider vector store, and then +// attempts to construct a vectorizable tree using the use-def chains.
+// +//==------------------------------------------------------------------------==// + +#include "llvm/Transforms/Vectorize/VectorWiden.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "vector-widen" + +// We consider independent operations for widening, with the possibility of +// merging those operations into one, and also widen stores if we find store +// instructions later. We have to bound the distance between those +// independent operations, or we might introduce excessive register pressure, etc. + +static cl::opt + MaxInstDistance("vw-max-instr-distance", cl::init(30), cl::Hidden, + cl::desc("Maximum distance between instructions to " "consider to widen")); + +static cl::opt OverrideTargetConsiderToWiden( + "vw-override-target-consider-to-widen", cl::init(false), cl::Hidden, + cl::desc("Ignore any target information while considering whether to widen")); + +namespace { +class VectorWiden { +public: + using InstrList = SmallVector; + using ValueList = SmallVector; + VectorWiden(Function &F, const TargetTransformInfo &TTI) + : F(F), Builder(F.getContext()), TTI(TTI) {} + + bool run(); + +private: + Function &F; + IRBuilder<> Builder; + const TargetTransformInfo &TTI; + TargetLibraryInfo *TLI; + + DenseSet DeletedInstructions; + + /// Checks if the instruction is marked for deletion. + bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } + + /// Removes an instruction from its block and eventually deletes it. + void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); } + + bool processBB(BasicBlock &BB, LLVMContext &Context); + + bool canWidenNode(ArrayRef IL, LLVMContext &Context); + + bool widenNode(ArrayRef IL, LLVMContext &Context); + + void widenCastInst(ArrayRef IL); + + void widenBinaryOperator(ArrayRef IL); + + InstructionCost getOpCost(unsigned Opcode, Type *To, Type *From, + Instruction *I); +}; +} // namespace + +void VectorWiden::widenCastInst(ArrayRef IL) { + Instruction *I = IL[0]; + Instruction *I1 = IL[1]; + auto *RetOrigType = cast(I->getType()); + auto *OrigType = cast(I->getOperand(0)->getType()); + auto *RetType = VectorType::getDoubleElementsVectorType(RetOrigType); + auto *OpType = VectorType::getDoubleElementsVectorType(OrigType); + + bool isBitCast = I->getOpcode() == Instruction::BitCast; + unsigned Offset = + dyn_cast(OrigType) + ? (cast(OrigType))->getMinNumElements() + : (cast(OrigType))->getNumElements(); + unsigned BitCastOffsetExtract = + (dyn_cast(RetType) + ?
(cast(RetType))->getMinNumElements() + : (cast(RetType))->getNumElements()) / + 2; + Value *WideVec = UndefValue::get(OpType); + Builder.SetInsertPoint(I); + Function *InsertIntr = llvm::Intrinsic::getDeclaration( + F.getParent(), Intrinsic::vector_insert, {OpType, OrigType}); + Value *Insert1 = Builder.CreateCall( + InsertIntr, {WideVec, I->getOperand(0), Builder.getInt64(0)}); + Value *Insert2 = Builder.CreateCall( + InsertIntr, {Insert1, I1->getOperand(0), Builder.getInt64(Offset)}); + Value *ResCast = Builder.CreateCast(Instruction::CastOps(I->getOpcode()), + Insert2, RetType); + + Function *ExtractIntr = llvm::Intrinsic::getDeclaration( + F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType}); + if (!I->users().empty()) { + Value *Res = + Builder.CreateCall(ExtractIntr, {ResCast, Builder.getInt64(0)}); + I->replaceAllUsesWith(Res); + } + if (!I1->users().empty()) { + Value *Res = Builder.CreateCall( + ExtractIntr, + {ResCast, Builder.getInt64(isBitCast ? BitCastOffsetExtract : Offset)}); + I1->replaceAllUsesWith(Res); + } +} + +void VectorWiden::widenBinaryOperator(ArrayRef IL) { + Instruction *I = IL[0]; + Instruction *I1 = IL[1]; + + Value *XHi = I->getOperand(0); + Value *XLo = I1->getOperand(0); + Value *YHi = I->getOperand(1); + Value *YLo = I1->getOperand(1); + + auto *RetOrigType = cast(I->getType()); + auto *OrigType = cast(I->getOperand(0)->getType()); + auto *RetType = VectorType::getDoubleElementsVectorType(RetOrigType); + auto *OpType = VectorType::getDoubleElementsVectorType(OrigType); + unsigned Offset = + dyn_cast(OrigType) + ? (cast(OrigType))->getMinNumElements() + : (cast(OrigType))->getNumElements(); + Value *WideVec = UndefValue::get(OpType); + Builder.SetInsertPoint(I); + Function *InsertIntr = llvm::Intrinsic::getDeclaration( + F.getParent(), Intrinsic::vector_insert, {OpType, OrigType}); + Value *X1 = + Builder.CreateCall(InsertIntr, {WideVec, XLo, Builder.getInt64(0)}); + Value *X2 = + Builder.CreateCall(InsertIntr, {X1, XHi, Builder.getInt64(Offset)}); + Value *Y1 = + Builder.CreateCall(InsertIntr, {WideVec, YLo, Builder.getInt64(0)}); + Value *Y2 = + Builder.CreateCall(InsertIntr, {Y1, YHi, Builder.getInt64(Offset)}); + Value *ResBinOp = + Builder.CreateBinOp((Instruction::BinaryOps)I->getOpcode(), X2, Y2); + ValueList VL; + for (Instruction *I : IL) + VL.push_back(I); + + propagateIRFlags(ResBinOp, VL); + + Function *ExtractIntr = llvm::Intrinsic::getDeclaration( + F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType}); + if (!I->users().empty()) { + Value *Res = + Builder.CreateCall(ExtractIntr, {ResBinOp, Builder.getInt64(Offset)}); + I->replaceAllUsesWith(Res); + } + if (!I1->users().empty()) { + Value *Res = + Builder.CreateCall(ExtractIntr, {ResBinOp, Builder.getInt64(0)}); + I1->replaceAllUsesWith(Res); + } +} + +bool VectorWiden::canWidenNode(ArrayRef IL, + LLVMContext &Context) { + if (!OverrideTargetConsiderToWiden && !TTI.considerToWiden(Context, IL)) + return false; + + bool HasSecondOperand = IL[0]->getNumOperands() > 1; + for (int X = 0, E = IL.size(); X < E; X++) { + for (int Y = 0, E = IL.size(); Y < E; Y++) { + if (X == Y) + continue; + if (IL[X] == IL[Y] || IL[X]->getOperand(0) == IL[Y] || + (HasSecondOperand && IL[X]->getOperand(1) == IL[Y])) + return false; + } + if (isDeleted(IL[X]) || !IL[X]->hasOneUse()) + return false; + if (X == 0) + continue; + if (IL[X]->getOpcode() != IL[X - 1]->getOpcode() || + // Ignore if any types are different. 
+ IL[X]->getType() != IL[X - 1]->getType() || + IL[X]->getOperand(0)->getType() != + IL[X - 1]->getOperand(0)->getType() || + IL[X - 1]->comesBefore(IL[X])) + return false; + if (IL[0]->getParent() == IL[X]->user_back()->getParent() && + IL[X]->user_back()->comesBefore(IL[0])) + return false; + } + return true; +} + +bool VectorWiden::widenNode(ArrayRef IL, LLVMContext &Context) { + // Currently, this pass only supports widening two operations into + // a single operation. + if (IL.size() != 2) + return false; + if (!canWidenNode(IL, Context)) + return false; + + unsigned Opcode = IL[0]->getOpcode(); + + if (dyn_cast(IL[0])) { + if (!OverrideTargetConsiderToWiden) { + auto *OrigType = cast(IL[0]->getOperand(0)->getType()); + auto *RetOrigType = cast(IL[0]->getType()); + InstructionCost Cost = getOpCost(Opcode, RetOrigType, OrigType, IL[0]); + auto *RetType = VectorType::getDoubleElementsVectorType(RetOrigType); + auto *OpType = VectorType::getDoubleElementsVectorType(OrigType); + InstructionCost CostNew = getOpCost(Opcode, RetType, OpType, IL[0]); + if (2 * Cost < CostNew) + return false; + } + LLVM_DEBUG( + dbgs() + << "VW: Decided to widen CastInst, safe to merge node starting with " + << *IL[0] << "\n"); + widenCastInst(IL); + return true; + } + if (dyn_cast(IL[0])) { + if (!OverrideTargetConsiderToWiden) { + auto *OrigType = cast(IL[0]->getOperand(0)->getType()); + auto *OpType = VectorType::getDoubleElementsVectorType(OrigType); + InstructionCost Cost = getOpCost(Opcode, OrigType, OrigType, IL[0]); + InstructionCost CostNew = getOpCost(Opcode, OpType, OpType, IL[0]); + if (2 * Cost < CostNew) + return false; + } + LLVM_DEBUG( + dbgs() + << "VW: Decided to widen BinaryOp, safe to merge node starting with " + << *IL[0] << "\n"); + // We want to propagate IR flags for the group of operations here, such as + // the "fast" flag for floating-point operations or "nuw" for integer ones. + widenBinaryOperator(IL); + return true; + } + return false; +} + +InstructionCost VectorWiden::getOpCost(unsigned Opcode, Type *To, Type *From, + Instruction *I) { + InstructionCost Cost = 0; + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + if (dyn_cast(I)) { + unsigned OpIdx = isa(I) ? 0 : 1; + TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(I->getOperand(0)); + TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(I->getOperand(OpIdx)); + SmallVector Operands(I->operand_values()); + Cost = TTI.getArithmeticInstrCost(I->getOpcode(), To, CostKind, Op1Info, + Op2Info, Operands, I); + } else if (dyn_cast(I)) { + Cost = TTI.getCastInstrCost(Opcode, To, From, TTI::getCastContextHint(I), + CostKind, I); + } + return Cost; +} + +static bool isOperationSupported(Instruction *I) { + unsigned Opcode = I->getOpcode(); + // Currently, we support only these operations, but more could be added later. + if (dyn_cast(I->getType()) && + (I->isBinaryOp() || Opcode == Instruction::SExt || + Opcode == Instruction::ZExt || Opcode == Instruction::FPToUI || + Opcode == Instruction::FPToSI || Opcode == Instruction::FPExt || + Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP || + Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc || + Opcode == Instruction::BitCast)) + return true; + return false; +} + +bool VectorWiden::processBB(BasicBlock &BB, LLVMContext &Context) { + struct Operation { + // Position at which the first operation in the list of operations was + // discovered, measured from the last instruction in the current basic block. + unsigned Position; + InstrList Ops; + }; + // The key is the operation opcode.
+ // The value is a list of operations together with the position of the + // first operation in the basic block. + DenseMap Operations; + Instruction *LastInstr = BB.getTerminator(); + unsigned CurrentPosition = 0; + for (BasicBlock::reverse_iterator IP(BB.rbegin()); IP != BB.rend(); + ++IP, ++CurrentPosition) { + Instruction *I = &*IP; + unsigned OpFound = 0; + + if (I->isDebugOrPseudoInst() || isDeleted(I) || !isOperationSupported(I)) + continue; + + unsigned Opcode = I->getOpcode(); + if (Operations.contains(Opcode)) { + Operation *OpRec = &Operations[Opcode]; + // If the instructions are too far apart, then remove the old instructions + // and reset the position to the next instruction remaining in the list. + if (CurrentPosition - OpRec->Position > MaxInstDistance) { + unsigned NumToDelete = 0; + for (InstrList::iterator It = OpRec->Ops.begin(); + It != OpRec->Ops.end(); ++It) { + Instruction *Instr = *It; + unsigned NewPosition = + std::distance(Instr->getIterator(), LastInstr->getIterator()); + if (CurrentPosition - NewPosition > MaxInstDistance) { + NumToDelete++; + } else { + // Update the Position value to the next remaining in-range operation. + OpRec->Position = NewPosition; + LLVM_DEBUG(dbgs() << "VW: Updating node starting with " + << **(OpRec->Ops.begin()) + << " position to : " << NewPosition << "\n"); + break; + } + } + for (unsigned i = 0; i < NumToDelete; ++i) { + LLVM_DEBUG(dbgs() + << "VW: Deleting operation " << **(OpRec->Ops.begin()) + << " from node as out of range." + << "\n"); + OpRec->Ops.erase(OpRec->Ops.begin()); + } + } + // If no operations are left in the list, set the position to the current one. + if (!OpRec->Ops.size()) + OpRec->Position = CurrentPosition; + OpRec->Ops.push_back(I); + LLVM_DEBUG(dbgs() << "VW: Found operation " << *I + << " to add to existing node starting at " + << **(OpRec->Ops.begin()) << " at : " << OpRec->Position + << "\n"); + if (OpRec->Ops.size() > 1) + OpFound = Opcode; + } else { + LLVM_DEBUG(dbgs() << "VW: Found operation " << *I + << " to form a node at : " << CurrentPosition << "\n"); + Operations[Opcode] = {CurrentPosition, {I}}; + } + + if (OpFound && Operations.contains(OpFound)) { + auto *OpRec = &Operations[OpFound]; + for (Instruction *Op : OpRec->Ops) + LLVM_DEBUG(dbgs() << "VW: operation to check : " << *Op << "\n"); + if (!widenNode(OpRec->Ops, Context)) { + LLVM_DEBUG(dbgs() << "VW: Unable to use a wider vector for vector ops.\n"); + if (OpRec->Ops.size() > 4) { + LLVM_DEBUG(dbgs() << "VW: Deleting operation " + << **(OpRec->Ops.begin()) << " as unable to widen." + << "\n"); + OpRec->Ops.erase(OpRec->Ops.begin()); + OpRec->Position = std::distance( + (*(OpRec->Ops.begin()))->getIterator(), LastInstr->getIterator()); + } + } else { + for (Instruction *Instr : OpRec->Ops) + eraseInstruction(Instr); + return true; + } + } + } + return false; +} + +bool VectorWiden::run() { + bool Changed = false; + LLVMContext &Context = F.getContext(); + + LLVM_DEBUG(dbgs() << "VW: Function:" << F.getName() << "\n"); + for (BasicBlock &BB : F) { + LLVM_DEBUG(dbgs() << "VW: BB:" << BB.getName() << "\n"); + + // If any transformation is done, then we have to start all over again, + // since we generate new instructions.
+ while (processBB(BB, Context)) + Changed = true; + } + + if (Changed) + for (auto *I : DeletedInstructions) + RecursivelyDeleteTriviallyDeadInstructions(I); + + return Changed; +} + +PreservedAnalyses VectorWidenPass::run(Function &F, + FunctionAnalysisManager &FAM) { + TargetTransformInfo &TTI = FAM.getResult(F); + + VectorWiden VecWiden(F, TTI); + + if (!VecWiden.run()) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} diff --git a/llvm/test/Transforms/VectorWiden/add.ll b/llvm/test/Transforms/VectorWiden/add.ll new file mode 100644 index 0000000000000..05b2eeeb5a9c6 --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/add.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S %s 2>&1 | FileCheck %s + +define void @add( %a, %b, %c, ptr %ptr) { +; CHECK-LABEL: define void @add( +; CHECK-SAME: [[A:%.*]], [[B:%.*]], [[C:%.*]], ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( undef, [[A]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP0]], [[B]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( undef, [[C]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[C]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = add [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP4]], i64 0) +; CHECK-NEXT: store [[TMP6]], ptr [[PTR]], align 16 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds , ptr [[PTR]], i64 1 +; CHECK-NEXT: store [[TMP5]], ptr [[INCDEC_PTR3]], align 16 +; CHECK-NEXT: ret void +; +entry: + %add = add %a, %c + %add4 = add %b, %c + store %add, ptr %ptr, align 16 + %incdec.ptr3 = getelementptr inbounds , ptr %ptr, i64 1 + store %add4, ptr %incdec.ptr3, align 16 + ret void +} + +define void @add_ir_flags( %a, %b, %c, ptr %ptr) { +; CHECK-LABEL: define void @add_ir_flags( +; CHECK-SAME: [[A:%.*]], [[B:%.*]], [[C:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( undef, [[A]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP0]], [[B]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( undef, [[C]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[C]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = add nuw [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP4]], i64 0) +; CHECK-NEXT: store [[TMP6]], ptr [[PTR]], align 16 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds , ptr [[PTR]], i64 1 +; CHECK-NEXT: store [[TMP5]], ptr [[INCDEC_PTR3]], align 16 +; CHECK-NEXT: ret void +; +entry: + %add = add nuw nsw %a, %c + %add4 = add nuw %b, %c + store %add, ptr %ptr, align 16 + %incdec.ptr3 = getelementptr inbounds , ptr %ptr, i64 1 + store %add4, ptr %incdec.ptr3, align 16 + ret void +} diff --git a/llvm/test/Transforms/VectorWiden/bad-dependace.ll b/llvm/test/Transforms/VectorWiden/bad-dependace.ll new file mode 100644 index 0000000000000..9e901037dcd2f --- /dev/null +++ 
b/llvm/test/Transforms/VectorWiden/bad-dependace.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S 2>&1 | FileCheck %s + +define void @fptrunc( %a, %b, ptr %ptr) { +; CHECK-LABEL: @fptrunc( +; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = fptrunc [[A:%.*]] to +; CHECK-NEXT: [[EXTR:%.*]] = call @llvm.vector.extract.nxv1f16.nxv4f16( [[TMP3]], i64 0) +; CHECK-NEXT: [[EXTEND:%.*]] = fpext [[EXTR]] to +; CHECK-NEXT: [[INS:%.*]] = call @llvm.vector.insert.nxv4f32.nxv1f32( [[B:%.*]], [[EXTEND]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = fptrunc [[INS]] to +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds half, ptr [[PTR:%.*]], i64 0 +; CHECK-NEXT: store [[TMP3]], ptr [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds half, ptr [[TMP5]], i64 [[TMP2]] +; CHECK-NEXT: store [[TMP4]], ptr [[TMP6]], align 2 +; CHECK-NEXT: ret void +; + %1 = tail call i64 @llvm.vscale.i64() + %2 = shl nuw nsw i64 %1, 2 + %3 = fptrunc %a to + %extr = call @llvm.vector.extract.nxv1f16.nxv4f16( %3, i64 0) + %extend = fpext %extr to + %ins = call @llvm.vector.insert.nxv4f32.nxv1f32( %b, %extend, i64 0) + %4 = fptrunc %ins to + %5 = getelementptr inbounds half, ptr %ptr, i64 0 + store %3, ptr %5, align 2 + %6 = getelementptr inbounds half, ptr %5, i64 %2 + store %4, ptr %6, align 2 + ret void +} + +define void @add( %a, %b, ptr %ptr) { +; CHECK-LABEL: @add( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD:%.*]] = add [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[ADD4:%.*]] = add [[ADD]], [[B]] +; CHECK-NEXT: store [[ADD]], ptr [[PTR:%.*]], align 16 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds , ptr [[PTR]], i64 1 +; CHECK-NEXT: store [[ADD4]], ptr [[INCDEC_PTR3]], align 16 +; CHECK-NEXT: ret void +; +entry: + %add = add %a, %b + %add4 = add %add, %b + store %add, ptr %ptr, align 16 + %incdec.ptr3 = getelementptr inbounds , ptr %ptr, i64 1 + store %add4, ptr %incdec.ptr3, align 16 + ret void +} + +declare i64 @llvm.vscale.i64() +declare @llvm.vector.extract.nxv1f16.nxv4f16(, i64 immarg) +declare @llvm.vector.insert.nxv4f32.nxv1f32(, , i64 immarg) diff --git a/llvm/test/Transforms/VectorWiden/bitcast.ll b/llvm/test/Transforms/VectorWiden/bitcast.ll new file mode 100644 index 0000000000000..c40780653e981 --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/bitcast.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s + + +define void @bitcast1(<8 x i64> %a, <8 x i64> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @bitcast1( +; CHECK-SAME: <8 x i64> [[A:%.*]], <8 x i64> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> undef, <8 x i64> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> [[TMP0]], <8 x i64> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i64> [[TMP1]] to <32 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.v32i32(<32 x i32> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.v32i32(<32 x i32> [[TMP2]], i64 16) +; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr [[PTR]], 
align 16 +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast <8 x i64> %a to <16 x i32> + %1 = bitcast <8 x i64> %b to <16 x i32> + store <16 x i32> %0, ptr %ptr, align 16 + store <16 x i32> %1, ptr %ptr1, align 16 + ret void +} + +define void @bitcast2(<4 x i64> %a, <4 x i64> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @bitcast2( +; CHECK-SAME: <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> undef, <4 x i64> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP0]], <4 x i64> [[A]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <64 x i8> +; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.vector.extract.v32i8.v64i8(<64 x i8> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.vector.extract.v32i8.v64i8(<64 x i8> [[TMP2]], i64 32) +; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast <4 x i64> %a to <32 x i8> + %1 = bitcast <4 x i64> %b to <32 x i8> + store <32 x i8> %0, ptr %ptr, align 16 + store <32 x i8> %1, ptr %ptr1, align 16 + ret void +} + +define void @bitcast3(<32 x i8> %a, <32 x i8> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @bitcast3( +; CHECK-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <64 x i8> @llvm.vector.insert.v64i8.v32i8(<64 x i8> undef, <32 x i8> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <64 x i8> @llvm.vector.insert.v64i8.v32i8(<64 x i8> [[TMP0]], <32 x i8> [[A]], i64 32) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <64 x i8> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP2]], i64 4) +; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast <32 x i8> %a to <4 x i64> + %1 = bitcast <32 x i8> %b to <4 x i64> + store <4 x i64> %0, ptr %ptr, align 16 + store <4 x i64> %1, ptr %ptr1, align 16 + ret void +} diff --git a/llvm/test/Transforms/VectorWiden/ext-trunc.ll b/llvm/test/Transforms/VectorWiden/ext-trunc.ll new file mode 100644 index 0000000000000..27db8440a499c --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/ext-trunc.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s + + +define void @sext(<8 x i8> %a, <8 x i8> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @sext( +; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> undef, <8 x i8> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP0]], <8 x i8> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[TMP1]] to <16 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 0) +; 
CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = sext <8 x i8> %a to <8 x i64> + %1 = sext <8 x i8> %b to <8 x i64> + store <8 x i64> %0, ptr %ptr, align 16 + store <8 x i64> %1, ptr %ptr1, align 16 + ret void +} + +define void @zext(<8 x i8> %a, <8 x i8> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @zext( +; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> undef, <8 x i8> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP0]], <8 x i8> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[TMP1]] to <16 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = zext <8 x i8> %a to <8 x i64> + %1 = zext <8 x i8> %b to <8 x i64> + store <8 x i64> %0, ptr %ptr, align 16 + store <8 x i64> %1, ptr %ptr1, align 16 + ret void +} + +define void @trunc(<8 x i64> %a, <8 x i64> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @trunc( +; CHECK-SAME: <8 x i64> [[A:%.*]], <8 x i64> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> undef, <8 x i64> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> [[TMP0]], <8 x i64> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i8> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x i8> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x i8> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = trunc <8 x i64> %a to <8 x i8> + %1 = trunc <8 x i64> %b to <8 x i8> + store <8 x i8> %0, ptr %ptr, align 16 + store <8 x i8> %1, ptr %ptr1, align 16 + ret void +} + diff --git a/llvm/test/Transforms/VectorWiden/fadd.ll b/llvm/test/Transforms/VectorWiden/fadd.ll new file mode 100644 index 0000000000000..01070ba824c3e --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/fadd.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s + +define void @add( %a, %b, %c, ptr %ptr) { +; CHECK-LABEL: define void @add( +; CHECK-SAME: [[A:%.*]], [[B:%.*]], [[C:%.*]], ptr [[PTR:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( undef, [[A]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP0]], [[B]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( undef, [[C]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[C]], i64 
4) +; CHECK-NEXT: [[TMP4:%.*]] = fadd [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4f32.nxv8f32( [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4f32.nxv8f32( [[TMP4]], i64 0) +; CHECK-NEXT: store [[TMP6]], ptr [[PTR]], align 16 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds , ptr [[PTR]], i64 1 +; CHECK-NEXT: store [[TMP5]], ptr [[INCDEC_PTR3]], align 16 +; CHECK-NEXT: ret void +; +entry: + %add = fadd %a, %c + %add4 = fadd %b, %c + store %add, ptr %ptr, align 16 + %incdec.ptr3 = getelementptr inbounds , ptr %ptr, i64 1 + store %add4, ptr %incdec.ptr3, align 16 + ret void +} + +define void @add_ir_flags( %a, %b, %c, ptr %ptr) { +; CHECK-LABEL: define void @add_ir_flags( +; CHECK-SAME: [[A:%.*]], [[B:%.*]], [[C:%.*]], ptr [[PTR:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( undef, [[A]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP0]], [[B]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( undef, [[C]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[C]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = fadd nnan ninf [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4f32.nxv8f32( [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4f32.nxv8f32( [[TMP4]], i64 0) +; CHECK-NEXT: store [[TMP6]], ptr [[PTR]], align 16 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds , ptr [[PTR]], i64 1 +; CHECK-NEXT: store [[TMP5]], ptr [[INCDEC_PTR3]], align 16 +; CHECK-NEXT: ret void +; +entry: + %add = fadd fast nnan %a, %c + %add4 = fadd nnan ninf %b, %c + store %add, ptr %ptr, align 16 + %incdec.ptr3 = getelementptr inbounds , ptr %ptr, i64 1 + store %add4, ptr %incdec.ptr3, align 16 + ret void +} diff --git a/llvm/test/Transforms/VectorWiden/fp-ext.ll b/llvm/test/Transforms/VectorWiden/fp-ext.ll new file mode 100644 index 0000000000000..a4fadd8dbaabf --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/fp-ext.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s + + +define void @fp_ext(<8 x half> %a, <8 x half> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @fp_ext( +; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> undef, <8 x half> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> [[TMP0]], <8 x half> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = fpext <16 x half> [[TMP1]] to <16 x double> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.vector.extract.v8f64.v16f64(<16 x double> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.vector.extract.v8f64.v16f64(<16 x double> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x double> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x double> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = fpext <8 x half> %a to <8 x double> + %1 = fpext <8 x half> %b to <8 x double> + store <8 x double> %0, ptr %ptr, align 16 + store <8 x double> %1, ptr %ptr1, align 16 + ret void +} diff --git a/llvm/test/Transforms/VectorWiden/fp-int.ll 
b/llvm/test/Transforms/VectorWiden/fp-int.ll new file mode 100644 index 0000000000000..1e1ebd2939b36 --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/fp-int.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s + +define void @fptosi(<8 x half> %a, <8 x half> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @fptosi( +; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> undef, <8 x half> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> [[TMP0]], <8 x half> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <16 x half> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = fptosi <8 x half> %a to <8 x i16> + %1 = fptosi <8 x half> %b to <8 x i16> + store <8 x i16> %0, ptr %ptr, align 16 + store <8 x i16> %1, ptr %ptr1, align 16 + ret void +} + +define void @sitofp(<8 x i16> %a, <8 x i16> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @sitofp( +; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> undef, <8 x i16> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP0]], <8 x i16> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x half> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x half> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x half> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = sitofp <8 x i16> %a to <8 x half> + %1 = sitofp <8 x i16> %b to <8 x half> + store <8 x half> %0, ptr %ptr, align 16 + store <8 x half> %1, ptr %ptr1, align 16 + ret void +} + +define void @fptoui(<8 x half> %a, <8 x half> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @fptoui( +; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> undef, <8 x half> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> [[TMP0]], <8 x half> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = fptoui <16 x half> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = fptoui <8 x half> %a to 
<8 x i16> + %1 = fptoui <8 x half> %b to <8 x i16> + store <8 x i16> %0, ptr %ptr, align 16 + store <8 x i16> %1, ptr %ptr1, align 16 + ret void +} + +define void @uitofp(<8 x i16> %a, <8 x i16> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @uitofp( +; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> undef, <8 x i16> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP0]], <8 x i16> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x half> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x half> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x half> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = uitofp <8 x i16> %a to <8 x half> + %1 = uitofp <8 x i16> %b to <8 x half> + store <8 x half> %0, ptr %ptr, align 16 + store <8 x half> %1, ptr %ptr1, align 16 + ret void +} diff --git a/llvm/test/Transforms/VectorWiden/fptrunc.ll b/llvm/test/Transforms/VectorWiden/fptrunc.ll new file mode 100644 index 0000000000000..4c19abf852ead --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/fptrunc.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S 2>&1 | FileCheck %s + +define void @fptrunc( %a, %b, ptr %ptr) { +; CHECK-LABEL: @fptrunc( +; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( undef, [[B:%.*]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[A:%.*]], i64 4) +; CHECK-NEXT: [[TMP5:%.*]] = fptrunc [[TMP4]] to +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4f16.nxv8f16( [[TMP5]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv4f16.nxv8f16( [[TMP5]], i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds half, ptr [[PTR:%.*]], i64 0 +; CHECK-NEXT: store [[TMP7]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds half, ptr [[TMP8]], i64 [[TMP2]] +; CHECK-NEXT: store [[TMP6]], ptr [[TMP9]], align 2 +; CHECK-NEXT: ret void +; + %1 = tail call i64 @llvm.vscale.i64() + %2 = shl nuw nsw i64 %1, 2 + %3 = fptrunc %a to + %4 = fptrunc %b to + %5 = getelementptr inbounds half, ptr %ptr, i64 0 + store %3, ptr %5, align 2 + %6 = getelementptr inbounds half, ptr %5, i64 %2 + store %4, ptr %6, align 2 + ret void +} + +declare i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/VectorWiden/widen-distance.ll b/llvm/test/Transforms/VectorWiden/widen-distance.ll new file mode 100644 index 0000000000000..bc6d0682f8475 --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/widen-distance.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=vector-widen -S -vw-override-target-consider-to-widen=1 -vw-max-instr-distance=2 < %s | FileCheck %s + +define <4 x i32> @foo(float %a0, float %a1, float %a2, float %a3, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTR1:%.*]], align 
16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTR1]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> undef, <4 x float> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP3]], <4 x float> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP5:%.*]] = fptosi <8 x float> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP5]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP5]], i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[A0:%.*]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[A3:%.*]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = fptosi <4 x float> [[TMP11]] to <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[PTR1]], align 2 +; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[PTR2:%.*]], align 2 +; CHECK-NEXT: ret <4 x i32> [[TMP7]] +; + %1 = load <4 x float>, ptr %ptr1 + %2 = fptosi <4 x float> %1 to <4 x i32> + %3 = load <4 x float>, ptr %ptr1 + %4 = fptosi <4 x float> %3 to <4 x i32> + %5 = insertelement <4 x float> poison, float %a0, i32 0 + %6 = insertelement <4 x float> %5, float %a1, i32 1 + %7 = insertelement <4 x float> %6, float %a2, i32 2 + %8 = insertelement <4 x float> %7, float %a3, i32 3 + %9 = fptosi <4 x float> %8 to <4 x i32> + store <4 x i32> %4, ptr %ptr1, align 2 + store <4 x i32> %9, ptr %ptr2, align 2 + ret <4 x i32> %2 +} + +define <4 x i32> @bar(<4 x float> %a0, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: @bar( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTR1:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[PTR1]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> undef, <4 x float> [[A0:%.*]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP4]], <4 x float> [[TMP3]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = fptosi <8 x float> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP6]], i64 4) +; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[PTR1]], align 2 +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[PTR2:%.*]], align 2 +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + + %1 = load <4 x float>, ptr %ptr1 + %2 = fptosi <4 x float> %1 to <4 x i32> + %3 = load <4 x float>, ptr %ptr1 + %4 = fptosi <4 x float> %3 to <4 x i32> + %5 = fptosi <4 x float> %a0 to <4 x i32> + store <4 x i32> %4, ptr %ptr1, align 2 + store <4 x i32> %5, ptr %ptr2, align 2 + ret <4 x i32> %2 +} diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index ca67426e08699..f5ef7bbd7106a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -19,5 +19,6 @@ static_library("Vectorize") { "VPlanVerifier.cpp", "VectorCombine.cpp", "Vectorize.cpp", + "VectorWiden.cpp", ] }