diff --git a/llvm/include/llvm/IR/IntrinsicsHexagon.td b/llvm/include/llvm/IR/IntrinsicsHexagon.td
index 20ba51ade35a7..2c945d2399b25 100644
--- a/llvm/include/llvm/IR/IntrinsicsHexagon.td
+++ b/llvm/include/llvm/IR/IntrinsicsHexagon.td
@@ -14,7 +14,7 @@
 //
 // All Hexagon intrinsics start with "llvm.hexagon.".
 let TargetPrefix = "hexagon" in {
-  /// Hexagon_Intrinsic - Base class for the majority of Hexagon intrinsics.
+  /// Hexagon_Intrinsic - Base class for majority of Hexagon intrinsics.
  class Hexagon_Intrinsic<string GCCIntSuffix, list<LLVMType> ret_types,
                          list<LLVMType> param_types,
                          list<IntrinsicProperty> properties>
@@ -435,6 +435,84 @@ def int_hexagon_V6_vmaskedstorenq_128B: Hexagon_custom_vms_Intrinsic_128B;
 def int_hexagon_V6_vmaskedstorentq_128B: Hexagon_custom_vms_Intrinsic_128B;
 def int_hexagon_V6_vmaskedstorentnq_128B: Hexagon_custom_vms_Intrinsic_128B;
 
+// Carryo
+// The script can't autogenerate clang builtins for vaddcarryo/vsubcarryo,
+// and they are marked in HexagonIset.py as not having intrinsics at all.
+// The script could generate the intrinsics, but instead of having intrinsics
+// without builtins, just put the intrinsics here.
+
+// tag : V6_vaddcarryo
+class Hexagon_custom_v16i32v64i1_v16i32v16i32_Intrinsic<
+      list<IntrinsicProperty> intr_properties = [IntrNoMem]>
+  : Hexagon_NonGCC_Intrinsic<
+       [llvm_v16i32_ty,llvm_v64i1_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
+       intr_properties>;
+
+// tag : V6_vaddcarryo
+class Hexagon_custom_v32i32v128i1_v32i32v32i32_Intrinsic_128B<
+      list<IntrinsicProperty> intr_properties = [IntrNoMem]>
+  : Hexagon_NonGCC_Intrinsic<
+       [llvm_v32i32_ty,llvm_v128i1_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
+       intr_properties>;
+
+// Pseudo intrinsics for widening vector instructions that
+// get replaced with the real Hexagon instructions during
+// instruction lowering.
+class Hexagon_widenvec_Intrinsic
+  : Hexagon_NonGCC_Intrinsic<
+       [llvm_anyvector_ty],
+       [LLVMTruncatedType<0>, LLVMTruncatedType<0>],
+       [IntrNoMem]>;
+
+class Hexagon_non_widenvec_Intrinsic
+  : Hexagon_NonGCC_Intrinsic<
+       [llvm_anyvector_ty],
+       [LLVMMatchType<0>, LLVMMatchType<0>],
+       [IntrNoMem]>;
+
+// Widening vector add
+def int_hexagon_vadd_su: Hexagon_widenvec_Intrinsic;
+def int_hexagon_vadd_uu: Hexagon_widenvec_Intrinsic;
+def int_hexagon_vadd_ss: Hexagon_widenvec_Intrinsic;
+def int_hexagon_vadd_us: Hexagon_widenvec_Intrinsic;
+
+
+// Widening vector subtract
+def int_hexagon_vsub_su: Hexagon_widenvec_Intrinsic;
+def int_hexagon_vsub_uu: Hexagon_widenvec_Intrinsic;
+def int_hexagon_vsub_ss: Hexagon_widenvec_Intrinsic;
+def int_hexagon_vsub_us: Hexagon_widenvec_Intrinsic;
+
+// Widening vector multiply
+def int_hexagon_vmpy_su: Hexagon_widenvec_Intrinsic;
+def int_hexagon_vmpy_uu: Hexagon_widenvec_Intrinsic;
+def int_hexagon_vmpy_ss: Hexagon_widenvec_Intrinsic;
+def int_hexagon_vmpy_us: Hexagon_widenvec_Intrinsic;
+
+def int_hexagon_vavgu: Hexagon_non_widenvec_Intrinsic;
+def int_hexagon_vavgs: Hexagon_non_widenvec_Intrinsic;
+
+class Hexagon_vasr_Intrinsic
+  : Hexagon_NonGCC_Intrinsic<
+       [LLVMSubdivide2VectorType<0>],
+       [llvm_anyvector_ty, LLVMMatchType<0>, llvm_i32_ty],
+       [IntrNoMem]>;
+
+def int_hexagon_vasrsat_su: Hexagon_vasr_Intrinsic;
+def int_hexagon_vasrsat_uu: Hexagon_vasr_Intrinsic;
+def int_hexagon_vasrsat_ss: Hexagon_vasr_Intrinsic;
+
+class Hexagon_widen_vec_scalar_Intrinsic
+  : Hexagon_NonGCC_Intrinsic<
+       [llvm_anyvector_ty],
+       [LLVMTruncatedType<0>, llvm_i32_ty],
+       [IntrNoMem]>;
+
+// Widening vector scalar multiply
+def int_hexagon_vmpy_ub_b: Hexagon_widen_vec_scalar_Intrinsic;
+def int_hexagon_vmpy_ub_ub: Hexagon_widen_vec_scalar_Intrinsic;
+def 
int_hexagon_vmpy_uh_uh: Hexagon_widen_vec_scalar_Intrinsic; +def int_hexagon_vmpy_h_h: Hexagon_widen_vec_scalar_Intrinsic; // Intrinsic for instrumentation based profiling using a custom handler. The // name of the handler is passed as the first operand to the intrinsic. The diff --git a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td index dde4132791f06..2a673603e4e03 100644 --- a/llvm/include/llvm/IR/IntrinsicsHexagonDep.td +++ b/llvm/include/llvm/IR/IntrinsicsHexagonDep.td @@ -491,20 +491,6 @@ class Hexagon_custom_v32i32v128i1_v32i32v32i32v128i1_Intrinsic_128B< [llvm_v32i32_ty,llvm_v128i1_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_v128i1_ty], intr_properties>; -// tag : V6_vaddcarryo -class Hexagon_custom_v16i32v64i1_v16i32v16i32_Intrinsic< - list intr_properties = [IntrNoMem]> - : Hexagon_NonGCC_Intrinsic< - [llvm_v16i32_ty,llvm_v64i1_ty], [llvm_v16i32_ty,llvm_v16i32_ty], - intr_properties>; - -// tag : V6_vaddcarryo -class Hexagon_custom_v32i32v128i1_v32i32v32i32_Intrinsic_128B< - list intr_properties = [IntrNoMem]> - : Hexagon_NonGCC_Intrinsic< - [llvm_v32i32_ty,llvm_v128i1_ty], [llvm_v32i32_ty,llvm_v32i32_ty], - intr_properties>; - // tag : V6_vaddcarrysat class Hexagon_v16i32_v16i32v16i32v64i1_Intrinsic intr_properties = [IntrNoMem]> diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index 1a5f09642ea66..eddab5a235dab 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -37,6 +37,8 @@ add_llvm_target(HexagonCodeGen HexagonGenMemAbsolute.cpp HexagonGenMux.cpp HexagonGenPredicate.cpp + HexagonGenWideningVecFloatInstr.cpp + HexagonGenWideningVecInstr.cpp HexagonHardwareLoops.cpp HexagonHazardRecognizer.cpp HexagonInstrInfo.cpp @@ -53,6 +55,7 @@ add_llvm_target(HexagonCodeGen HexagonNewValueJump.cpp HexagonOptAddrMode.cpp HexagonOptimizeSZextends.cpp + HexagonOptShuffleVector.cpp HexagonPeephole.cpp HexagonQFPOptimizer.cpp HexagonRDFOpt.cpp diff --git a/llvm/lib/Target/Hexagon/Hexagon.h b/llvm/lib/Target/Hexagon/Hexagon.h index 422ab20891b94..b98369d1b3e30 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.h +++ b/llvm/lib/Target/Hexagon/Hexagon.h @@ -92,6 +92,9 @@ FunctionPass *createHexagonGenInsert(); FunctionPass *createHexagonGenMemAbsolute(); FunctionPass *createHexagonGenMux(); FunctionPass *createHexagonGenPredicate(); +FunctionPass * +createHexagonGenWideningVecFloatInstr(const HexagonTargetMachine &); +FunctionPass *createHexagonGenWideningVecInstr(const HexagonTargetMachine &); FunctionPass *createHexagonHardwareLoops(); FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, CodeGenOptLevel OptLevel); @@ -102,6 +105,7 @@ FunctionPass *createHexagonMergeActivateWeight(); FunctionPass *createHexagonNewValueJump(); FunctionPass *createHexagonOptAddrMode(); FunctionPass *createHexagonOptimizeSZextends(); +FunctionPass *createHexagonOptShuffleVector(const HexagonTargetMachine &); FunctionPass *createHexagonPacketizer(bool Minimal); FunctionPass *createHexagonPeephole(); FunctionPass *createHexagonRDFOpt(); diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index 5344ed8446efc..412d58743df94 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -51,8 +51,7 @@ struct PrintRegister { }; [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, - const PrintRegister &PR); -raw_ostream &operator<<(raw_ostream &OS, const PrintRegister 
&PR) {
   return OS << printReg(PR.Reg.Reg, &PR.TRI, PR.Reg.SubReg);
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonGenWideningVecFloatInstr.cpp b/llvm/lib/Target/Hexagon/HexagonGenWideningVecFloatInstr.cpp
new file mode 100644
index 0000000000000..7271f1f839d69
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonGenWideningVecFloatInstr.cpp
@@ -0,0 +1,565 @@
+//===------------------- HexagonGenWideningVecFloatInstr.cpp --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Replace widening vector float operations with Hexagon intrinsics.
+//
+//===----------------------------------------------------------------------===//
+//
+// Brief overview of how the GenWideningVecFloatInstr pass works.
+// This pass is a replica of the existing HexagonGenWideningVecInstr pass
+// (which replaces widening vector integer operations with their respective
+// intrinsics). In this pass we generate Hexagon intrinsics for widening
+// vector float instructions.
+//
+// Example1(64 vector-width widening):
+// %wide.load = load <64 x half>, <64 x half>* %0, align 2
+// %wide.load53 = load <64 x half>, <64 x half>* %2, align 2
+// %1 = fpext <64 x half> %wide.load to <64 x float>
+// %3 = fpext <64 x half> %wide.load53 to <64 x float>
+// %4 = fmul <64 x float> %1, %3
+//
+// If we run this pass on the above example, it will first find the fmul
+// instruction, and then check whether the operands of the fmul instruction
+// (%1 and %3) belong to one of these categories: [%1 ->fpext, %3 ->fpext],
+// [%1 ->fpext, %3 ->constant_vector], or [%1 ->constant_vector, %3 ->fpext].
+// If it sees such a pattern, this pass will replace it with the
+// appropriate Hexagon intrinsics.
+// +// After replacement: +// %wide.load = load <64 x half>, <64 x half>* %0, align 2 +// %wide.load53 = load <64 x half>, <64 x half>* %2, align 2 +// %3 = bitcast <64 x half> %wide.load to <32 x i32> +// %4 = bitcast <64 x half> %wide.load53 to <32 x i32> +// %5 = call <64 x i32> @llvm.hexagon.V6.vmpy.qf32.hf.128B(%3, %4) +// %6 = shufflevector <64 x i32> %5, <64 x i32> poison, <64 x i32> ShuffMask1 +// %7 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %6) +// %8 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %6) +// %9 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %7) +// %10 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %8) +// %11 = bitcast <32 x i32> %9 to <32 x float> +// %12 = bitcast <32 x i32> %10 to <32 x float> +// %13 = shufflevector <32 x float> %12, <32 x float> %11, <64 x i32> ShuffMask2 +// +// +// +// Example2(128 vector-width widening): +// %0 = bitcast half* %a to <128 x half>* +// %wide.load = load <128 x half>, <128 x half>* %0, align 2 +// %1 = fpext <128 x half> %wide.load to <128 x float> +// %2 = bitcast half* %b to <128 x half>* +// %wide.load2 = load <128 x half>, <128 x half>* %2, align 2 +// %3 = fpext <128 x half> %wide.load2 to <128 x float> +// %4 = fmul <128 x float> %1, %3 +// +// After replacement: +// %0 = bitcast half* %a to <128 x half>* +// %wide.load = load <128 x half>, <128 x half>* %0, align 2 +// %1 = bitcast half* %b to <128 x half>* +// %wide.load2 = load <128 x half>, <128 x half>* %1, align 2 +// %2 = bitcast <128 x half> %wide.load to <64 x i32> +// %3 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %2) +// %4 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %2) +// %5 = bitcast <128 x half> %wide.load2 to <64 x i32> +// %6 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %5) +// %7 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %5) +// %8 = call <64 x i32> @llvm.hexagon.V6.vmpy.qf32.hf.128B(%3, %6) +// %9 = shufflevector <64 x i32> %8, <64 x i32> poison, <64 x i32> Mask1 +// %10 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %9) +// %11 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %9) +// %12 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %10) +// %13 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %11) +// %14 = bitcast <32 x i32> %12 to <32 x float> +// %15 = bitcast <32 x i32> %13 to <32 x float> +// %16 = shufflevector <32 x float> %15, <32 x float> %14, <64 x i32> Mask2 +// %17 = call <64 x i32> @llvm.hexagon.V6.vmpy.qf32.hf.128B(%4, %7) +// %18 = shufflevector <64 x i32> %17, <64 x i32> poison, <64 x i32> Mask1 +// %19 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %18) +// %20 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %18) +// %21 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %19) +// %22 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %20) +// %23 = bitcast <32 x i32> %21 to <32 x float> +// %24 = bitcast <32 x i32> %22 to <32 x float> +// %25 = shufflevector <32 x float> %24, <32 x float> %23, <64 x i32> Mask2 +// %26 = shufflevector <64 x float> %25, <64 x float> %16, <128 x i32> Mask3 +// +// +//===----------------------------------------------------------------------===// +#include "HexagonTargetMachine.h" +#include "llvm/ADT/APInt.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include 
"llvm/IR/IntrinsicsHexagon.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include +#include + +using namespace llvm; + +namespace llvm { +void initializeHexagonGenWideningVecFloatInstrPass(PassRegistry &); +FunctionPass * +createHexagonGenWideningVecFloatInstr(const HexagonTargetMachine &); +} // end namespace llvm + +namespace { + +class HexagonGenWideningVecFloatInstr : public FunctionPass { +public: + static char ID; + + HexagonGenWideningVecFloatInstr() : FunctionPass(ID) { + initializeHexagonGenWideningVecFloatInstrPass( + *PassRegistry::getPassRegistry()); + } + + HexagonGenWideningVecFloatInstr(const HexagonTargetMachine *TM) + : FunctionPass(ID), TM(TM) { + initializeHexagonGenWideningVecFloatInstrPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Hexagon generate widening vector float instructions"; + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + } + +private: + Module *M = nullptr; + const HexagonTargetMachine *TM = nullptr; + const HexagonSubtarget *HST = nullptr; + unsigned HwVLen; + unsigned NumHalfEltsInFullVec; + + struct OPInfo { + Value *OP; + Value *ExtInOP; + unsigned ExtInSize; + }; + + bool visitBlock(BasicBlock *B); + bool processInstruction(Instruction *Inst); + bool replaceWithIntrinsic(Instruction *Inst, OPInfo &OP1Info, + OPInfo &OP2Info); + + bool getOperandInfo(Value *V, OPInfo &OPI); + bool isExtendedConstant(Constant *C); + unsigned getElementSizeInBits(Value *V); + Type *getElementTy(unsigned size, IRBuilder<> &IRB); + + Value *adjustExtensionForOp(OPInfo &OPI, IRBuilder<> &IRB, + unsigned NewEltsize, unsigned NumElts); + + std::pair opSplit(Value *OP, Instruction *Inst); + + Value *createIntrinsic(Intrinsic::ID IntId, Instruction *Inst, Value *NewOP1, + Value *NewOP2, FixedVectorType *ResType, + unsigned NumElts, bool BitCastOp); +}; + +} // end anonymous namespace + +char HexagonGenWideningVecFloatInstr::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonGenWideningVecFloatInstr, "widening-vec-float", + "Hexagon generate " + "widening vector float instructions", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(HexagonGenWideningVecFloatInstr, "widening-vec-float", + "Hexagon generate " + "widening vector float instructions", + false, false) + +bool HexagonGenWideningVecFloatInstr::isExtendedConstant(Constant *C) { + if (Value *SplatV = C->getSplatValue()) { + if (auto *CFP = dyn_cast(SplatV)) { + bool Ignored; + APFloat APF = CFP->getValueAPF(); + APFloat::opStatus sts = APF.convert( + APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored); + if (sts == APFloat::opStatus::opOK || sts == APFloat::opStatus::opInexact) + return true; + } + return false; + } + unsigned NumElts = cast(C->getType())->getNumElements(); + for (unsigned i = 0, e = NumElts; i != e; ++i) { + if (auto *CFP = dyn_cast(C->getAggregateElement(i))) { + bool Ignored; + APFloat APF = CFP->getValueAPF(); + APFloat::opStatus sts = APF.convert( + APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored); + if (sts != APFloat::opStatus::opOK && sts != APFloat::opStatus::opInexact) + return false; + continue; + } + return false; + } + return true; +} + +unsigned HexagonGenWideningVecFloatInstr::getElementSizeInBits(Value *V) { + Type *ValTy = V->getType(); + Type *EltTy = ValTy; + if (dyn_cast(V)) { 
+ unsigned EltSize = + cast(EltTy)->getElementType()->getPrimitiveSizeInBits(); + unsigned ReducedSize = EltSize / 2; + + return ReducedSize; + } + + if (ValTy->isVectorTy()) + EltTy = cast(ValTy)->getElementType(); + return EltTy->getPrimitiveSizeInBits(); +} + +bool HexagonGenWideningVecFloatInstr::getOperandInfo(Value *V, OPInfo &OPI) { + using namespace PatternMatch; + OPI.OP = V; + Value *ExtV = nullptr; + Constant *C = nullptr; + + if (match(V, (m_FPExt(m_Value(ExtV)))) || + match(V, + m_Shuffle(m_InsertElt(m_Poison(), m_FPExt(m_Value(ExtV)), m_Zero()), + m_Poison(), m_ZeroMask()))) { + + if (auto *ExtVType = dyn_cast(ExtV->getType())) { + // Matches the first branch. + if (ExtVType->getElementType()->isBFloatTy()) + // do not confuse bf16 with ieee-fp16. + return false; + } else { + // Matches the second branch (insert element branch) + if (ExtV->getType()->isBFloatTy()) + return false; + } + + OPI.ExtInOP = ExtV; + OPI.ExtInSize = getElementSizeInBits(OPI.ExtInOP); + return true; + } + + if (match(V, m_Constant(C))) { + if (!isExtendedConstant(C)) + return false; + OPI.ExtInOP = C; + OPI.ExtInSize = getElementSizeInBits(OPI.ExtInOP); + return true; + } + + return false; +} + +Type *HexagonGenWideningVecFloatInstr::getElementTy(unsigned size, + IRBuilder<> &IRB) { + switch (size) { + case 16: + return IRB.getHalfTy(); + case 32: + return IRB.getFloatTy(); + default: + llvm_unreachable("Unhandled Element size"); + } +} + +Value *HexagonGenWideningVecFloatInstr::adjustExtensionForOp( + OPInfo &OPI, IRBuilder<> &IRB, unsigned NewExtSize, unsigned NumElts) { + Value *V = OPI.ExtInOP; + unsigned EltSize = getElementSizeInBits(OPI.ExtInOP); + assert(NewExtSize >= EltSize); + Type *EltType = getElementTy(NewExtSize, IRB); + auto *NewOpTy = FixedVectorType::get(EltType, NumElts); + + if (auto *C = dyn_cast(V)) + return IRB.CreateFPTrunc(C, NewOpTy); + + if (V->getType()->isVectorTy()) + if (NewExtSize == EltSize) + return V; + + return nullptr; +} + +std::pair +HexagonGenWideningVecFloatInstr::opSplit(Value *OP, Instruction *Inst) { + Type *InstTy = Inst->getType(); + unsigned NumElts = cast(InstTy)->getNumElements(); + IRBuilder<> IRB(Inst); + Intrinsic::ID IntHi = Intrinsic::hexagon_V6_hi_128B; + Intrinsic::ID IntLo = Intrinsic::hexagon_V6_lo_128B; + Function *ExtFHi = Intrinsic::getOrInsertDeclaration(M, IntHi); + Function *ExtFLo = Intrinsic::getOrInsertDeclaration(M, IntLo); + if (NumElts == 128) { + auto *InType = FixedVectorType::get(IRB.getInt32Ty(), 64); + OP = IRB.CreateBitCast(OP, InType); + } + Value *OP1Hi = IRB.CreateCall(ExtFHi, {OP}); + Value *OP1Lo = IRB.CreateCall(ExtFLo, {OP}); + return std::pair(OP1Hi, OP1Lo); +} + +Value *HexagonGenWideningVecFloatInstr::createIntrinsic( + Intrinsic::ID IntId, Instruction *Inst, Value *NewOP1, Value *NewOP2, + FixedVectorType *ResType, unsigned NumElts, bool BitCastOp) { + + IRBuilder<> IRB(Inst); + Function *ExtF = Intrinsic::getOrInsertDeclaration(M, IntId); + Function *ConvF = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::hexagon_V6_vconv_sf_qf32_128B); + auto *InType = FixedVectorType::get(IRB.getInt32Ty(), 32); + auto *RType = FixedVectorType::get(IRB.getFloatTy(), 32); + + // Make sure inputs to vmpy instrinsic are full vectors + if (NumElts == NumHalfEltsInFullVec / 2) { + SmallVector ConcatMask1; + for (unsigned i = 0; i < NumHalfEltsInFullVec; ++i) + ConcatMask1.push_back(IRB.getInt32(i)); + NewOP1 = + IRB.CreateShuffleVector(NewOP1, PoisonValue::get(NewOP1->getType()), + ConstantVector::get(ConcatMask1)); + NewOP2 = + 
IRB.CreateShuffleVector(NewOP2, PoisonValue::get(NewOP2->getType()), + ConstantVector::get(ConcatMask1)); + } + + if (BitCastOp) { + NewOP1 = IRB.CreateBitCast(NewOP1, InType); + NewOP2 = IRB.CreateBitCast(NewOP2, InType); + } + + Value *NewIn = IRB.CreateCall(ExtF, {NewOP1, NewOP2}); + // Interleave the output elements to ensure correct order in Hi and Lo vectors + // Shuffled Mask: [0, 32, 1, 33, ..., 31, 63] + // Hi: [0, 1, ..., 31] and Lo: [32, 33, ..., 63] + SmallVector Mask; + unsigned HalfVecPoint = NumHalfEltsInFullVec / 2; + for (unsigned i = 0; i < HalfVecPoint; ++i) { + Mask.push_back(IRB.getInt32(i)); + Mask.push_back(IRB.getInt32(HalfVecPoint + i)); + } + NewIn = IRB.CreateShuffleVector(NewIn, PoisonValue::get(NewIn->getType()), + ConstantVector::get(Mask)); + + std::pair SplitOP = opSplit(NewIn, Inst); + Value *ConvHi = IRB.CreateCall(ConvF, {SplitOP.first}); + ConvHi = IRB.CreateBitCast(ConvHi, RType); + + if (ResType->getNumElements() == NumHalfEltsInFullVec / 2) { + return ConvHi; + } + + Value *ConvLo = IRB.CreateCall(ConvF, {SplitOP.second}); + ConvLo = IRB.CreateBitCast(ConvLo, RType); + + SmallVector ShuffleMask; + for (unsigned i = 0; i < NumElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i)); + // Concat Hi and Lo. + NewIn = + IRB.CreateShuffleVector(ConvLo, ConvHi, ConstantVector::get(ShuffleMask)); + return NewIn; +} + +bool HexagonGenWideningVecFloatInstr::replaceWithIntrinsic(Instruction *Inst, + OPInfo &OP1Info, + OPInfo &OP2Info) { + Type *InstTy = Inst->getType(); + Type *EltTy = cast(InstTy)->getElementType(); + unsigned NumElts = cast(InstTy)->getNumElements(); + [[maybe_unused]] unsigned InstEltSize = EltTy->getPrimitiveSizeInBits(); + + unsigned MaxEltSize = OP1Info.ExtInSize; + unsigned NewOpEltSize = MaxEltSize; + unsigned NewResEltSize = 2 * MaxEltSize; + + unsigned ResVLen = NewResEltSize * NumElts; + if (NewOpEltSize > 16 || ((ResVLen > HwVLen) && (ResVLen % HwVLen) != 0)) + return false; + + Intrinsic::ID IntId = Intrinsic::hexagon_V6_vmpy_qf32_hf_128B; + IRBuilder<> IRB(Inst); + Value *NewOP1 = adjustExtensionForOp(OP1Info, IRB, NewOpEltSize, NumElts); + Value *NewOP2 = adjustExtensionForOp(OP2Info, IRB, NewOpEltSize, NumElts); + + if (NewOP1 == nullptr || NewOP2 == nullptr) + return false; + + if (ResVLen > 2 * HwVLen) { + // The code written in this if block generates the widening code when + // vector-width is 128: + // + // Step 1: Bitcast <128 x half> type to <64 x i32> + // %wide.load = load <128 x half>, <128 x half>* %0 is bitcasted to, + // bitcast <128 x half> %wide.load to <64 x i32> + // + // Step 2: Generate Hi and Lo vectors + // call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %4) + // call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %4) + // + // Perform above 2 steps for both the operands of fmul instruction + // + // Step 3: Generate vmpy_qf32_hf multiply instruction to multiply two Hi + // vectors from both operands. + // call <64 x i32> @llvm.hexagon.V6.vmpy.qf32.hf.128B(%5, %8) + // + // Step 4: Convert the resultant 'qf32' output to 'sf' format + // %11 = shufflevector <64 x i32> %10, <64 x i32> poison, <64 x i32> Mask1 + // %12 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %11) + // %13 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %11) + // call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %12) + // call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %13) + // + // Repeat steps 3 and 4 for mutiplication and conversion of Lo vectors. 
+ // Finally merge the output values in correct sequence using shuffle + // vectors. + + assert(ResVLen == 4 * HwVLen); + // Split the operands + unsigned HalfElts = NumElts / 2; + std::pair SplitOP1 = opSplit(NewOP1, Inst); + std::pair SplitOP2 = opSplit(NewOP2, Inst); + auto *castResType = FixedVectorType::get(IRB.getInt32Ty(), HalfElts); + Value *NewInHi = + createIntrinsic(IntId, Inst, SplitOP1.first, SplitOP2.first, + castResType, HalfElts, false); + Value *NewInLo = + createIntrinsic(IntId, Inst, SplitOP1.second, SplitOP2.second, + castResType, HalfElts, false); + assert(InstEltSize == NewResEltSize); + SmallVector ShuffleMask; + for (unsigned i = 0; i < NumElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i)); + // Concat Hi and Lo. + Value *NewIn = IRB.CreateShuffleVector(NewInLo, NewInHi, + ConstantVector::get(ShuffleMask)); + + Inst->replaceAllUsesWith(NewIn); + return true; + } + + auto *ResType = + FixedVectorType::get(getElementTy(NewResEltSize, IRB), NumElts); + + // The following widening code can only be generated in cases where + // input vectors are 64xhalf/32xhalf and the results are 64xfloat/32xfloat + // respectively. + if (!(NumElts == NumHalfEltsInFullVec && + ResType->getNumElements() == NumHalfEltsInFullVec) && + !(NumElts == NumHalfEltsInFullVec / 2 && + ResType->getNumElements() == NumHalfEltsInFullVec / 2)) + return false; + Value *NewIn = + createIntrinsic(IntId, Inst, NewOP1, NewOP2, ResType, NumElts, true); + + Inst->replaceAllUsesWith(NewIn); + return true; +} + +// Process instruction and replace them with widening vector +// intrinsics if possible. +bool HexagonGenWideningVecFloatInstr::processInstruction(Instruction *Inst) { + Type *InstTy = Inst->getType(); + if (!InstTy->isVectorTy() || + cast(InstTy)->getNumElements() > 128) + return false; + unsigned InstLen = InstTy->getPrimitiveSizeInBits(); + if (!HST->isTypeForHVX(cast(InstTy)) && InstLen != 4 * HwVLen) + return false; + if (InstLen < HwVLen) + return false; + + using namespace PatternMatch; + + Value *OP1 = nullptr, *OP2 = nullptr; + OPInfo OP1Info, OP2Info; + + // Handle the case when Inst = fpext(fmul<64xhalf>(op1, op2)). The Inst can + // be replaced with widening multiply. + if (match(Inst, (m_FPExt((m_FMul(m_Value(OP1), m_Value(OP2))))))) { + OP1Info.ExtInOP = OP1; + OP1Info.ExtInSize = getElementSizeInBits(OP1); + OP2Info.ExtInOP = OP2; + OP2Info.ExtInSize = getElementSizeInBits(OP2); + + if (auto *Op1Vtype = dyn_cast(OP1->getType())) { + if (!Op1Vtype->getElementType()->isHalfTy()) { + return false; + } + } else { + return false; + } + + if (OP1Info.ExtInSize == OP2Info.ExtInSize && OP1Info.ExtInSize == 16 && + getElementSizeInBits(Inst) == 32) { + return replaceWithIntrinsic(Inst, OP1Info, OP2Info); + } + } + + if (!match(Inst, (m_FMul(m_Value(OP1), m_Value(OP2))))) + return false; + + if (!getOperandInfo(OP1, OP1Info) || !getOperandInfo(OP2, OP2Info)) + return false; + + if (!OP1Info.ExtInOP || !OP2Info.ExtInOP) + return false; + + if (OP1Info.ExtInSize == OP2Info.ExtInSize && OP1Info.ExtInSize == 16) { + return replaceWithIntrinsic(Inst, OP1Info, OP2Info); + } + + return false; +} + +bool HexagonGenWideningVecFloatInstr::visitBlock(BasicBlock *B) { + bool Changed = false; + for (auto &I : *B) + Changed |= processInstruction(&I); + return Changed; +} + +bool HexagonGenWideningVecFloatInstr::runOnFunction(Function &F) { + M = F.getParent(); + HST = TM->getSubtargetImpl(F); + + // Return if useHVX128BOps is not set. It can be enabled for 64B mode + // but wil require some changes. 
For example, bitcast for intrinsics + // assumes 128B mode. + if (skipFunction(F) || !HST->useHVX128BOps()) + return false; + + unsigned VecLength = HST->getVectorLength(); // Vector Length in Bytes + HwVLen = HST->getVectorLength() * 8; // Vector Length in bits + NumHalfEltsInFullVec = + VecLength / + 2; // Number of half (2B) elements that fit into a full HVX vector + bool Changed = false; + for (auto &B : F) + Changed |= visitBlock(&B); + + return Changed; +} + +FunctionPass * +llvm::createHexagonGenWideningVecFloatInstr(const HexagonTargetMachine &TM) { + return new HexagonGenWideningVecFloatInstr(&TM); +} diff --git a/llvm/lib/Target/Hexagon/HexagonGenWideningVecInstr.cpp b/llvm/lib/Target/Hexagon/HexagonGenWideningVecInstr.cpp new file mode 100644 index 0000000000000..8df22ae6ebb06 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonGenWideningVecInstr.cpp @@ -0,0 +1,1184 @@ +//===--------------------- HexagonGenWideningVecInstr.cpp -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Replace widening vector operations with hexagon intrinsics. +// +//===----------------------------------------------------------------------===// + +#include "HexagonTargetMachine.h" +#include "llvm/ADT/APInt.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsHexagon.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include +#include + +using namespace llvm; + +// A command line argument to enable the generation of widening instructions +// for short-vectors. 
+static cl::opt WidenShortVector( + "hexagon-widen-short-vector", + cl::desc("Generate widening instructions for short vectors."), cl::Hidden); + +namespace llvm { +void initializeHexagonGenWideningVecInstrPass(PassRegistry &); +FunctionPass *createHexagonGenWideningVecInstr(const HexagonTargetMachine &); +} // end namespace llvm + +namespace { + +class HexagonGenWideningVecInstr : public FunctionPass { +public: + static char ID; + + HexagonGenWideningVecInstr() : FunctionPass(ID) { + initializeHexagonGenWideningVecInstrPass(*PassRegistry::getPassRegistry()); + } + + HexagonGenWideningVecInstr(const HexagonTargetMachine *TM) + : FunctionPass(ID), TM(TM) { + initializeHexagonGenWideningVecInstrPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Hexagon generate widening vector instructions"; + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + } + +private: + Module *M = nullptr; + const HexagonTargetMachine *TM = nullptr; + const HexagonSubtarget *HST = nullptr; + unsigned HwVLen; + enum OPKind { OP_None = 0, OP_Add, OP_Sub, OP_Mul, OP_Shl }; + + struct OPInfo { + Value *OP = nullptr; + Value *ExtInOP = nullptr; + bool IsZExt = false; + unsigned ExtInSize = 0; + bool IsScalar = false; + }; + + bool visitBlock(BasicBlock *B); + bool processInstruction(Instruction *Inst); + bool replaceWithIntrinsic(Instruction *Inst, OPKind OPK, OPInfo &OP1Info, + OPInfo &OP2Info); + bool getOperandInfo(Value *V, OPInfo &OPI); + bool isExtendedConstant(Constant *C, bool IsSigned); + unsigned getElementSizeInBits(Value *V, bool IsZExt); + Type *getElementTy(unsigned size, IRBuilder<> &IRB); + + Value *adjustExtensionForOp(OPInfo &OPI, IRBuilder<> &IRB, + unsigned NewEltsize, unsigned NumElts); + + Intrinsic::ID getIntrinsic(OPKind OPK, bool IsOP1ZExt, bool IsOP2ZExt, + unsigned NewOpEltSize, unsigned NewResEltSize, + bool IsConstScalar, int ConstOpNum); + + std::pair opSplit(Value *OP, Instruction *Inst, + Type *NewOpType); + + Value *createIntrinsic(Intrinsic::ID IntId, Instruction *Inst, Value *NewOP1, + Value *NewOP2, Type *ResType, unsigned NumElts, + bool Interleave); + bool processInstructionForVMPA(Instruction *Inst); + bool getVmpaOperandInfo(Value *V, OPInfo &OPI); + void reorderVmpaOperands(OPInfo *OPI); + bool replaceWithVmpaIntrinsic(Instruction *Inst, OPInfo *OPI); + bool genSaturatingInst(Instruction *Inst); + bool getMinMax(Constant *MinC, Constant *MaxC, std::pair &MinMax); + bool isSaturatingVAsr(Instruction *Inst, Value *S, int MinV, int MaxV, + bool &IsResSigned); + Value *extendShiftByVal(Value *ShiftByVal, IRBuilder<> &IRB); + Intrinsic::ID getVAsrIntrinsic(bool IsInSigned, bool IsResSigned); + Value *createVAsrIntrinsic(Instruction *Inst, Value *VecOP, Value *ShiftByVal, + bool IsResSigned); + bool genVAvg(Instruction *Inst); + bool checkConstantVector(Value *OP, int64_t &SplatVal, bool IsOPZExt); + void updateMPYConst(Intrinsic::ID IntId, int64_t &SplatVal, bool IsOPZExt, + Value *&OP, IRBuilder<> &IRB); + void packConstant(Intrinsic::ID IntId, int64_t &SplatVal, Value *&OP, + IRBuilder<> &IRB); +}; + +} // end anonymous namespace + +char HexagonGenWideningVecInstr::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonGenWideningVecInstr, "widening-vec", + "Hexagon generate " + "widening vector instructions", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(HexagonGenWideningVecInstr, "widening-vec", + "Hexagon 
generate " + "widening vector instructions", + false, false) + +static bool hasNegativeValues(Constant *C) { + if (Value *SplatV = C->getSplatValue()) { + auto *CI = dyn_cast(SplatV); + assert(CI); + return CI->getValue().isNegative(); + } + unsigned NumElts = cast(C->getType())->getNumElements(); + for (unsigned i = 0, e = NumElts; i != e; ++i) { + auto *CI = dyn_cast(C->getAggregateElement(i)); + assert(CI); + if (CI->getValue().isNegative()) + return true; + continue; + } + return false; +} + +bool HexagonGenWideningVecInstr::getOperandInfo(Value *V, OPInfo &OPI) { + using namespace PatternMatch; + OPI.OP = V; + Value *ExtV = nullptr; + Constant *C = nullptr; + + bool Match = false; + if ((Match = (match(V, (m_ZExt(m_Value(ExtV)))) || + match(V, m_Shuffle(m_InsertElt(m_Poison(), + m_ZExt(m_Value(ExtV)), m_Zero()), + m_Poison(), m_ZeroMask()))))) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = true; + } + + if (!Match && + (Match = (match(V, (m_SExt(m_Value(ExtV)))) || + match(V, m_Shuffle(m_InsertElt(m_Poison(), + m_SExt(m_Value(ExtV)), m_Zero()), + m_Poison(), m_ZeroMask()))))) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = false; + } + if (!Match && + (Match = + (match(V, m_Shuffle(m_InsertElt(m_Poison(), m_Value(ExtV), m_Zero()), + m_Poison(), m_ZeroMask()))))) { + if (match(ExtV, m_And(m_Value(), m_SpecificInt(255)))) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = true; + OPI.ExtInSize = 8; + return true; + } + if (match(ExtV, m_And(m_Value(), m_SpecificInt(65535)))) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = true; + OPI.ExtInSize = 16; + return true; + } + return false; + } + + if (!Match && (Match = match(V, m_Constant(C)))) { + if (!isExtendedConstant(C, false) && !isExtendedConstant(C, true)) + return false; + OPI.ExtInOP = C; + OPI.IsZExt = !hasNegativeValues(C); + } + + if (!Match) + return false; + + // If the operand is extended, find the element size of its input. + if (OPI.ExtInOP) + OPI.ExtInSize = getElementSizeInBits(OPI.ExtInOP, OPI.IsZExt); + return true; +} + +bool HexagonGenWideningVecInstr::isExtendedConstant(Constant *C, + bool IsSigned) { + Type *CTy = cast(C->getType())->getElementType(); + unsigned EltSize = CTy->getPrimitiveSizeInBits(); + unsigned HalfSize = EltSize / 2; + if (Value *SplatV = C->getSplatValue()) { + if (auto *CI = dyn_cast(SplatV)) + return IsSigned ? 
isIntN(HalfSize, CI->getSExtValue()) + : isUIntN(HalfSize, CI->getZExtValue()); + return false; + } + unsigned NumElts = cast(C->getType())->getNumElements(); + for (unsigned i = 0, e = NumElts; i != e; ++i) { + if (auto *CI = dyn_cast(C->getAggregateElement(i))) { + if ((IsSigned && !isIntN(HalfSize, CI->getSExtValue())) || + (!IsSigned && !isUIntN(HalfSize, CI->getZExtValue()))) + return false; + continue; + } + return false; + } + return true; +} + +unsigned HexagonGenWideningVecInstr::getElementSizeInBits(Value *V, + bool IsZExt = false) { + using namespace PatternMatch; + Type *ValTy = V->getType(); + Type *EltTy = ValTy; + if (auto *C = dyn_cast(V)) { + unsigned NumElts = cast(EltTy)->getNumElements(); + unsigned EltSize = cast(EltTy) + ->getElementType() + ->getPrimitiveSizeInBits() + .getKnownMinValue(); + unsigned ReducedSize = EltSize / 2; + + while (ReducedSize >= 8) { + for (unsigned i = 0, e = NumElts; i != e; ++i) { + if (auto *CI = dyn_cast(C->getAggregateElement(i))) { + if (IsZExt) { + if (!isUIntN(ReducedSize, CI->getZExtValue())) + return EltSize; + } else if (!isIntN(ReducedSize, CI->getSExtValue())) + return EltSize; + } + } + EltSize = ReducedSize; + ReducedSize = ReducedSize / 2; + } + return EltSize; + } + + if (ValTy->isVectorTy()) + EltTy = cast(ValTy)->getElementType(); + return EltTy->getPrimitiveSizeInBits(); +} + +Value *HexagonGenWideningVecInstr::adjustExtensionForOp(OPInfo &OPI, + IRBuilder<> &IRB, + unsigned NewExtSize, + unsigned NumElts) { + Value *V = OPI.ExtInOP; + bool IsZExt = OPI.IsZExt; + unsigned EltSize = getElementSizeInBits(OPI.ExtInOP, OPI.IsZExt); + Type *EltType = getElementTy(NewExtSize, IRB); + auto *NewOpTy = FixedVectorType::get(EltType, NumElts); + + if (dyn_cast(V)) + return IRB.CreateTrunc(V, NewOpTy); + + if (V->getType()->isVectorTy()) { + if (NewExtSize == EltSize) + return V; + assert(NewExtSize == 16); + auto *NewOpTy = FixedVectorType::get(IRB.getInt16Ty(), NumElts); + return (IsZExt) ? IRB.CreateZExt(V, NewOpTy) : IRB.CreateSExt(V, NewOpTy); + } + + // The operand must correspond to a shuffle vector which is used to construct + // a vector out of a scalar. Since the scalar value (V) is extended, + // replace it with a new shuffle vector with the smaller element size. + [[maybe_unused]] auto *I = dyn_cast(OPI.OP); + assert(I && I->getOpcode() == Instruction::ShuffleVector); + + if (NewExtSize > EltSize) + V = (IsZExt) ? IRB.CreateZExt(V, EltType) : IRB.CreateSExt(V, EltType); + else if (NewExtSize < EltSize) + V = IRB.CreateTrunc(V, EltType); + + Value *IE = + IRB.CreateInsertElement(PoisonValue::get(NewOpTy), V, IRB.getInt32(0)); + + SmallVector ShuffleMask; + for (unsigned i = 0; i < NumElts; ++i) + ShuffleMask.push_back(IRB.getInt32(0)); + + return IRB.CreateShuffleVector(IE, PoisonValue::get(NewOpTy), + ConstantVector::get(ShuffleMask)); +} + +Intrinsic::ID HexagonGenWideningVecInstr::getIntrinsic( + OPKind OPK, bool IsOP1ZExt, bool IsOP2ZExt, unsigned InEltSize, + unsigned ResEltSize, bool IsConstScalar, int ConstOpNum) { + // Since the operands have been extended, the ResEltSize must be 16 or more. + switch (OPK) { + case OP_Add: + // Both operands should be either zero extended or sign extended. + assert(IsOP1ZExt == IsOP2ZExt); + if (InEltSize == 8 && ResEltSize == 16) { + // Operands must be zero extended as we don't have a widening vector + // 'add' that can take signed exteded values. 
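+      // (Sign-extended i8 operands are widened to i16 earlier in
+      // replaceWithIntrinsic, so this case is only reached with
+      // zero-extended inputs.)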
+      assert(IsOP1ZExt && "Operands must be zero-extended");
+      return Intrinsic::hexagon_vadd_uu;
+    }
+    if (InEltSize == 16 && ResEltSize == 32)
+      return (IsOP1ZExt) ? Intrinsic::hexagon_vadd_uu
+                         : Intrinsic::hexagon_vadd_ss;
+
+    llvm_unreachable("Incorrect input and output operand sizes");
+
+  case OP_Sub:
+    // Both operands should be either zero extended or sign extended.
+    assert(IsOP1ZExt == IsOP2ZExt);
+    if (InEltSize == 8 && ResEltSize == 16) {
+      // Operands must be zero extended as we don't have a widening vector
+      // 'sub' that can take sign-extended values.
+      assert(IsOP1ZExt && "Operands must be zero-extended");
+      return Intrinsic::hexagon_vsub_uu;
+    }
+    if (InEltSize == 16 && ResEltSize == 32)
+      return (IsOP1ZExt) ? Intrinsic::hexagon_vsub_uu
+                         : Intrinsic::hexagon_vsub_ss;
+
+    llvm_unreachable("Incorrect input and output operand sizes");
+
+  case OP_Mul:
+    assert(ResEltSize == 2 * InEltSize);
+    // Enter this block when one of the operands is a constant vector.
+    if (IsConstScalar) {
+      // Inputs are 8-bit and the output is 16-bit.
+      if (InEltSize == 8 && ResEltSize == 16) {
+        // Check which operand of the mul instruction is the constant vector.
+        if (ConstOpNum == 2 && IsOP1ZExt) {
+          // If the value inside the constant vector is zero-extended, then
+          // return hexagon_vmpy_ub_ub, else return hexagon_vmpy_ub_b.
+          return (IsOP2ZExt) ? Intrinsic::hexagon_vmpy_ub_ub
+                             : Intrinsic::hexagon_vmpy_ub_b;
+        } else if (ConstOpNum == 1 && IsOP2ZExt) {
+          return (IsOP1ZExt) ? Intrinsic::hexagon_vmpy_ub_ub
+                             : Intrinsic::hexagon_vmpy_ub_b;
+        }
+      }
+      // Inputs are 16-bit and the output is 32-bit.
+      if (InEltSize == 16 && ResEltSize == 32) {
+        if (IsOP1ZExt && IsOP2ZExt) {
+          // Both the constant vector and the other operand are
+          // zero-extended, so return hexagon_vmpy_uh_uh.
+          return Intrinsic::hexagon_vmpy_uh_uh;
+        } else if (!IsOP1ZExt && !IsOP2ZExt) {
+          // Both the constant vector and the other operand are
+          // sign-extended, so return hexagon_vmpy_h_h.
+          return Intrinsic::hexagon_vmpy_h_h;
+        }
+      }
+    }
+    if (IsOP1ZExt)
+      return IsOP2ZExt ? Intrinsic::hexagon_vmpy_uu
+                       : Intrinsic::hexagon_vmpy_us;
+    else
+      return IsOP2ZExt ? Intrinsic::hexagon_vmpy_su
+                       : Intrinsic::hexagon_vmpy_ss;
+  default:
+    llvm_unreachable("Instruction not handled!");
+  }
+}
+
+Type *HexagonGenWideningVecInstr::getElementTy(unsigned size,
+                                               IRBuilder<> &IRB) {
+  switch (size) {
+  case 8:
+    return IRB.getInt8Ty();
+  case 16:
+    return IRB.getInt16Ty();
+  case 32:
+    return IRB.getInt32Ty();
+  default:
+    llvm_unreachable("Unhandled Element size");
+  }
+}
+
+Value *HexagonGenWideningVecInstr::createIntrinsic(
+    Intrinsic::ID IntId, Instruction *Inst, Value *NewOP1, Value *NewOP2,
+    Type *ResType, unsigned NumElts, bool Interleave = true) {
+  IRBuilder<> IRB(Inst);
+  Function *ExtF = Intrinsic::getOrInsertDeclaration(M, IntId, ResType);
+  Value *NewIn = IRB.CreateCall(ExtF, {NewOP1, NewOP2});
+  if (Interleave) {
+    // Interleave elements in the output vector.
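+    // e.g., for a 64-element result the mask is [0, 32, 1, 33, ..., 31, 63].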
+ SmallVector ShuffleMask; + unsigned HalfElts = NumElts / 2; + for (unsigned i = 0; i < HalfElts; ++i) { + ShuffleMask.push_back(IRB.getInt32(i)); + ShuffleMask.push_back(IRB.getInt32(HalfElts + i)); + } + NewIn = IRB.CreateShuffleVector(NewIn, PoisonValue::get(ResType), + ConstantVector::get(ShuffleMask)); + } + return NewIn; +} + +std::pair +HexagonGenWideningVecInstr::opSplit(Value *OP, Instruction *Inst, + Type *NewOpType) { + Type *InstTy = Inst->getType(); + unsigned NumElts = cast(InstTy)->getNumElements(); + IRBuilder<> IRB(Inst); + if (InstTy->getPrimitiveSizeInBits() < 2 * HwVLen) { + // The only time we need to split an OP even though it is not a + // vector-pair is while generating vasr instruction for the short vector. + // Since hi/lo intrinsics can't be used here as they expect the operands to + // be of 64xi32 type, the shuffle_vector pair with the appropriate masks is + // used instead. + assert(NumElts % 2 == 0 && "Unexpected Vector Type!!"); + unsigned HalfElts = NumElts / 2; + SmallVector HiM; + SmallVector LoM; + for (unsigned i = 0; i < HalfElts; ++i) + LoM.push_back(IRB.getInt32(i)); + for (unsigned i = 0; i < HalfElts; ++i) + HiM.push_back(IRB.getInt32(HalfElts + i)); + + Value *Hi = IRB.CreateShuffleVector(OP, PoisonValue::get(OP->getType()), + ConstantVector::get(HiM)); + Value *Lo = IRB.CreateShuffleVector(OP, PoisonValue::get(OP->getType()), + ConstantVector::get(LoM)); + return std::pair(Hi, Lo); + } + + Intrinsic::ID IntHi = Intrinsic::hexagon_V6_hi_128B; + Intrinsic::ID IntLo = Intrinsic::hexagon_V6_lo_128B; + Function *ExtFHi = Intrinsic::getOrInsertDeclaration(M, IntHi); + Function *ExtFLo = Intrinsic::getOrInsertDeclaration(M, IntLo); + auto *InType = FixedVectorType::get(IRB.getInt32Ty(), 64); + OP = IRB.CreateBitCast(OP, InType); + Value *Hi = IRB.CreateCall(ExtFHi, {OP}); // 32xi32 + Value *Lo = IRB.CreateCall(ExtFLo, {OP}); + Hi = IRB.CreateBitCast(Hi, NewOpType); + Lo = IRB.CreateBitCast(Lo, NewOpType); + return std::pair(Hi, Lo); +} + +bool HexagonGenWideningVecInstr::checkConstantVector(Value *OP, + int64_t &SplatVal, + bool IsOPZExt) { + if (auto *C1 = dyn_cast(OP)) { + if (Value *SplatV = C1->getSplatValue()) { + auto *CI = dyn_cast(SplatV); + if (IsOPZExt) { + SplatVal = CI->getZExtValue(); + } else { + SplatVal = CI->getSExtValue(); + } + return true; + } + } + return false; +} + +void HexagonGenWideningVecInstr::updateMPYConst(Intrinsic::ID IntId, + int64_t &SplatVal, + bool IsOPZExt, Value *&OP, + IRBuilder<> &IRB) { + if ((IntId == Intrinsic::hexagon_vmpy_uu || + IntId == Intrinsic::hexagon_vmpy_us || + IntId == Intrinsic::hexagon_vmpy_su || + IntId == Intrinsic::hexagon_vmpy_ss) && + OP->getType()->isVectorTy()) { + // Create a vector with all elements equal to SplatVal + auto *VecTy = cast(OP->getType()); + Value *scalar = IRB.getIntN(VecTy->getScalarSizeInBits(), + static_cast(SplatVal)); + Value *splatVector = ConstantVector::getSplat(VecTy->getElementCount(), + cast(scalar)); + OP = IsOPZExt ? 
IRB.CreateZExt(splatVector, VecTy) + : IRB.CreateSExt(splatVector, VecTy); + } else { + packConstant(IntId, SplatVal, OP, IRB); + } +} + +void HexagonGenWideningVecInstr::packConstant(Intrinsic::ID IntId, + int64_t &SplatVal, Value *&OP, + IRBuilder<> &IRB) { + uint32_t Val32 = static_cast(SplatVal); + if (IntId == Intrinsic::hexagon_vmpy_ub_ub) { + assert(SplatVal >= 0 && SplatVal <= UINT8_MAX); + uint32_t packed = (Val32 << 24) | (Val32 << 16) | (Val32 << 8) | Val32; + OP = IRB.getInt32(packed); + } else if (IntId == Intrinsic::hexagon_vmpy_ub_b) { + assert(SplatVal >= INT8_MIN && SplatVal <= INT8_MAX); + uint32_t packed = (Val32 << 24) | ((Val32 << 16) & ((1 << 24) - 1)) | + ((Val32 << 8) & ((1 << 16) - 1)) | + (Val32 & ((1 << 8) - 1)); + OP = IRB.getInt32(packed); + } else if (IntId == Intrinsic::hexagon_vmpy_uh_uh) { + assert(SplatVal >= 0 && SplatVal <= UINT16_MAX); + uint32_t packed = (Val32 << 16) | Val32; + OP = IRB.getInt32(packed); + } else if (IntId == Intrinsic::hexagon_vmpy_h_h) { + assert(SplatVal >= INT16_MIN && SplatVal <= INT16_MAX); + uint32_t packed = (Val32 << 16) | (Val32 & ((1 << 16) - 1)); + OP = IRB.getInt32(packed); + } +} + +bool HexagonGenWideningVecInstr::replaceWithIntrinsic(Instruction *Inst, + OPKind OPK, + OPInfo &OP1Info, + OPInfo &OP2Info) { + Type *InstTy = Inst->getType(); + Type *EltTy = cast(InstTy)->getElementType(); + unsigned NumElts = cast(InstTy)->getNumElements(); + unsigned InstEltSize = EltTy->getPrimitiveSizeInBits(); + + bool IsOP1ZExt = OP1Info.IsZExt; + bool IsOP2ZExt = OP2Info.IsZExt; + + // The resulting values of 'add' and 'sub' are always sign-extended. + bool IsResZExt = (OPK == OP_Mul || OPK == OP_Shl) + ? (OP1Info.IsZExt && OP2Info.IsZExt) + : false; + + unsigned MaxEltSize = std::max(OP1Info.ExtInSize, OP2Info.ExtInSize); + unsigned NewOpEltSize = MaxEltSize; + unsigned NewResEltSize = 2 * MaxEltSize; + + // For Add and Sub, both the operands should be either zero extended + // or sign extended. In case of a mismatch, they are extended to the + // next size (ex: 8 bits -> 16 bits) so that the sign-extended vadd/vsub + // instructions can be used. Also, we don't support 8-bits signed vadd/vsub + // instructions. They are extended to 16-bits and then signed 16-bits + // non-widening vadd/vsub is used to perform the operation. + if (OPK != OP_Mul && OPK != OP_Shl && + (IsOP1ZExt != IsOP2ZExt || (!IsOP1ZExt && NewOpEltSize == 8))) + NewOpEltSize = 2 * NewOpEltSize; + + unsigned ResVLen = NewResEltSize * NumElts; + if (ResVLen < HwVLen && !WidenShortVector) + return false; + if (NewOpEltSize > 16 || ((ResVLen > HwVLen) && (ResVLen % HwVLen) != 0)) + return false; + + IRBuilder<> IRB(Inst); + Value *NewOP1 = adjustExtensionForOp(OP1Info, IRB, NewOpEltSize, NumElts); + Value *NewOP2 = adjustExtensionForOp(OP2Info, IRB, NewOpEltSize, NumElts); + + if (NewOpEltSize == NewResEltSize) { + assert(OPK != OP_Mul && OPK != OP_Shl); + // Instead of intrinsics, use vector add/sub. 
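+    // (This path is reached when mixed-sign or signed 8-bit operands were
+    // bumped to the result width above, so a plain non-widening add/sub at
+    // that width already yields the correct result.)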
+ Value *NewIn = IRB.CreateBinOp(cast(Inst)->getOpcode(), + NewOP1, NewOP2); + if (InstEltSize > NewResEltSize) + NewIn = IRB.CreateSExt(NewIn, InstTy); + Inst->replaceAllUsesWith(NewIn); + return true; + } + + bool IsConstScalar = false; + int64_t SplatVal = 0; + int ConstOpNum = 1; + if (OPK == OP_Mul || OPK == OP_Shl) { + IsConstScalar = checkConstantVector(NewOP1, SplatVal, IsOP1ZExt); + if (!IsConstScalar) { + IsConstScalar = checkConstantVector(NewOP2, SplatVal, IsOP2ZExt); + ConstOpNum = 2; + } + } + + if (IsConstScalar && OPK == OP_Shl) { + if (((NewOpEltSize == 8) && (SplatVal > 0) && (SplatVal < 8)) || + ((NewOpEltSize == 16) && (SplatVal > 0) && (SplatVal < 16))) { + SplatVal = 1 << SplatVal; + OPK = OP_Mul; + } else { + return false; + } + } else if (!IsConstScalar && OPK == OP_Shl) { + return false; + } + + Intrinsic::ID IntId = getIntrinsic(OPK, IsOP1ZExt, IsOP2ZExt, NewOpEltSize, + NewResEltSize, IsConstScalar, ConstOpNum); + + if (IsConstScalar) { + updateMPYConst(IntId, SplatVal, IsOP2ZExt, NewOP2, IRB); + } + + // Split the node if it needs more than a vector pair for the result. + if (ResVLen > 2 * HwVLen) { + assert(ResVLen == 4 * HwVLen); + // Split the operands + unsigned HalfElts = NumElts / 2; + auto *NewOpType = + FixedVectorType::get(getElementTy(NewOpEltSize, IRB), HalfElts); + auto *ResType = + FixedVectorType::get(getElementTy(NewResEltSize, IRB), HalfElts); + std::pair SplitOP1 = opSplit(NewOP1, Inst, NewOpType); + std::pair SplitOP2; + if (IsConstScalar && (IntId == Intrinsic::hexagon_vmpy_h_h || + IntId == Intrinsic::hexagon_vmpy_uh_uh)) { + SplitOP2 = std::pair(NewOP2, NewOP2); + } else { + SplitOP2 = opSplit(NewOP2, Inst, NewOpType); + } + Value *NewInHi = createIntrinsic(IntId, Inst, SplitOP1.first, + SplitOP2.first, ResType, HalfElts, true); + Value *NewInLo = createIntrinsic(IntId, Inst, SplitOP1.second, + SplitOP2.second, ResType, HalfElts, true); + assert(InstEltSize == NewResEltSize); + SmallVector ShuffleMask; + for (unsigned i = 0; i < NumElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i)); + // Concat Hi and Lo. + Value *NewIn = IRB.CreateShuffleVector(NewInLo, NewInHi, + ConstantVector::get(ShuffleMask)); + + Inst->replaceAllUsesWith(NewIn); + return true; + } + + auto *ResType = + FixedVectorType::get(getElementTy(NewResEltSize, IRB), NumElts); + Value *NewIn = + createIntrinsic(IntId, Inst, NewOP1, NewOP2, ResType, NumElts, true); + if (InstEltSize > NewResEltSize) + NewIn = (IsResZExt) ? IRB.CreateZExt(NewIn, InstTy) + : IRB.CreateSExt(NewIn, InstTy); + + Inst->replaceAllUsesWith(NewIn); + + return true; +} + +// Process instruction and replace them with widening vector +// intrinsics if possible. 
+bool HexagonGenWideningVecInstr::processInstruction(Instruction *Inst) { + Type *InstTy = Inst->getType(); + if (!InstTy->isVectorTy() || + cast(InstTy)->getNumElements() > 128) + return false; + unsigned InstLen = InstTy->getPrimitiveSizeInBits(); + if (!HST->isTypeForHVX(cast(InstTy)) && InstLen != 4 * HwVLen) + return false; + if (InstLen < HwVLen && !WidenShortVector) + return false; + + using namespace PatternMatch; + + OPKind OPK; + Value *OP1 = nullptr, *OP2 = nullptr; + if (match(Inst, (m_Sub(m_Value(OP1), m_Value(OP2))))) + OPK = OP_Sub; + else if (match(Inst, (m_Add(m_Value(OP1), m_Value(OP2))))) + OPK = OP_Add; + else if (match(Inst, (m_Mul(m_Value(OP1), m_Value(OP2))))) + OPK = OP_Mul; + else if (match(Inst, (m_Shl(m_Value(OP1), m_Value(OP2))))) + OPK = OP_Shl; + else + return false; + + OPInfo OP1Info, OP2Info; + + if (!getOperandInfo(OP1, OP1Info) || !getOperandInfo(OP2, OP2Info)) + return false; + + // Proceed only if both input operands are extended. + if (!OP1Info.ExtInOP || !OP2Info.ExtInOP) + return false; + + return replaceWithIntrinsic(Inst, OPK, OP1Info, OP2Info); +} + +bool HexagonGenWideningVecInstr::getVmpaOperandInfo(Value *V, OPInfo &OPI) { + using namespace PatternMatch; + OPI.OP = V; + Value *ExtV, *OP1 = nullptr; + + if (match(V, + m_ZExt(m_Shuffle(m_InsertElt(m_Poison(), m_Value(ExtV), m_Zero()), + m_Poison(), m_ZeroMask()))) || + match(V, + m_Shuffle(m_InsertElt(m_Poison(), m_ZExt(m_Value(ExtV)), m_Zero()), + m_Poison(), m_ZeroMask()))) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = true; + OPI.IsScalar = true; + OPI.ExtInSize = ExtV->getType()->getPrimitiveSizeInBits(); + return true; + } + + ConstantInt *I = nullptr; + if ((match(V, m_Shuffle(m_InsertElt(m_Poison(), m_Value(ExtV), m_Zero()), + m_Poison(), m_ZeroMask())))) { + if (match(ExtV, m_And(m_Value(OP1), m_ConstantInt(I)))) { + uint32_t IValue = I->getZExtValue(); + if (IValue <= 255) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = true; + OPI.ExtInSize = 8; + OPI.IsScalar = true; + return true; + } + } + } + + // Match for non-scalar operands + return getOperandInfo(V, OPI); +} + +// Process instruction and replace with the vmpa intrinsic if possible. +bool HexagonGenWideningVecInstr::processInstructionForVMPA(Instruction *Inst) { + using namespace PatternMatch; + Type *InstTy = Inst->getType(); + // TODO: Extend it to handle short vector instructions (< HwVLen). + // vmpa instructions produce a vector register pair. + if (!InstTy->isVectorTy() || InstTy->getPrimitiveSizeInBits() != 2 * HwVLen) + return false; + + Value *OP1 = nullptr, *OP2 = nullptr; + if (!match(Inst, (m_Add(m_Value(OP1), m_Value(OP2))))) + return false; + + Value *OP[4] = {nullptr, nullptr, nullptr, nullptr}; + if (!match(OP1, m_Mul(m_Value(OP[0]), m_Value(OP[1]))) || + !match(OP2, m_Mul(m_Value(OP[2]), m_Value(OP[3])))) + return false; + + OPInfo OP_Info[4]; + for (unsigned i = 0; i < 4; i++) + if (!getVmpaOperandInfo(OP[i], OP_Info[i]) || !OP_Info[i].ExtInOP) + return false; + + return replaceWithVmpaIntrinsic(Inst, OP_Info); +} + +// Reorder operand info in OPI so that the vector operands come before their +// scalar counterparts. +void HexagonGenWideningVecInstr::reorderVmpaOperands(OPInfo *OPI) { + for (unsigned i = 0; i < 2; i++) + if (!OPI[2 * i].ExtInOP->getType()->isVectorTy()) { + OPInfo Temp; + Temp = OPI[2 * i]; + OPI[2 * i] = OPI[2 * i + 1]; + OPI[2 * i + 1] = Temp; + } +} + +// Only handles the case where one input to vmpa has to be a scalar +// and another is a vector. 
It can be easily extended to cover +// other types of vmpa instructions. +bool HexagonGenWideningVecInstr::replaceWithVmpaIntrinsic(Instruction *Inst, + OPInfo *OPI) { + reorderVmpaOperands(OPI); + + // After reordering of the operands in OPI, the odd elements must have + // IsScalar flag set to true. Also, check the even elements for non-scalars. + if (!OPI[1].IsScalar || !OPI[3].IsScalar || OPI[0].IsScalar || + OPI[2].IsScalar) + return false; + + OPInfo SOPI1 = OPI[1]; + OPInfo SOPI2 = OPI[3]; + + // The scalar operand in the vmpa instructions needs to be an int8. + if (SOPI1.ExtInSize != SOPI2.ExtInSize || SOPI1.ExtInSize != 8) + return false; + + Type *InstTy = Inst->getType(); + Type *EltTy = cast(InstTy)->getElementType(); + unsigned NumElts = cast(InstTy)->getNumElements(); + unsigned InstEltSize = EltTy->getPrimitiveSizeInBits(); + + unsigned MaxVEltSize = std::max(OPI[0].ExtInSize, OPI[2].ExtInSize); + unsigned NewVOpEltSize = MaxVEltSize; + unsigned NewResEltSize = 2 * MaxVEltSize; + + if (NumElts * NewVOpEltSize < HwVLen) { + // Extend the operand so that we don't end up with an invalid vector size. + NewVOpEltSize = 2 * NewVOpEltSize; + NewResEltSize = 2 * NewResEltSize; + } + + IRBuilder<> IRB(Inst); + + // Construct scalar operand + Value *NewSOP1 = SOPI1.ExtInOP; + Value *NewSOP2 = SOPI2.ExtInOP; + + Type *S1Ty = NewSOP1->getType(); + Type *S2Ty = NewSOP2->getType(); + if (S1Ty->getPrimitiveSizeInBits() < 32) + NewSOP1 = IRB.CreateZExt(NewSOP1, IRB.getInt32Ty()); + if (S2Ty->getPrimitiveSizeInBits() < 32) + NewSOP2 = IRB.CreateZExt(NewSOP2, IRB.getInt32Ty()); + + Value *SHL = IRB.CreateShl(NewSOP1, IRB.getInt32(8)); + Value *OR = IRB.CreateOr(SHL, NewSOP2); + Intrinsic::ID CombineIntID = Intrinsic::hexagon_A2_combine_ll; + Function *ExtF = Intrinsic::getOrInsertDeclaration(M, CombineIntID); + Value *ScalarOP = IRB.CreateCall(ExtF, {OR, OR}); + + // Construct vector operand + Value *NewVOP1 = adjustExtensionForOp(OPI[0], IRB, NewVOpEltSize, NumElts); + Value *NewVOP2 = adjustExtensionForOp(OPI[2], IRB, NewVOpEltSize, NumElts); + + // Combine both vector operands to form the vector-pair for vmpa + Intrinsic::ID VCombineIntID = Intrinsic::hexagon_V6_vcombine_128B; + ExtF = Intrinsic::getOrInsertDeclaration(M, VCombineIntID); + Type *InType = FixedVectorType::get(IRB.getInt32Ty(), 32); + NewVOP1 = IRB.CreateBitCast(NewVOP1, InType); + NewVOP2 = IRB.CreateBitCast(NewVOP2, InType); + Value *VecOP = IRB.CreateCall(ExtF, {NewVOP1, NewVOP2}); + + Intrinsic::ID VmpaIntID = + (NewResEltSize == 16) ? VmpaIntID = Intrinsic::hexagon_V6_vmpabus_128B + : VmpaIntID = Intrinsic::hexagon_V6_vmpauhb_128B; + ExtF = Intrinsic::getOrInsertDeclaration(M, VmpaIntID); + auto *ResType = + FixedVectorType::get(getElementTy(NewResEltSize, IRB), NumElts); + Value *NewIn = IRB.CreateCall(ExtF, {VecOP, ScalarOP}); + NewIn = IRB.CreateBitCast(NewIn, ResType); + + if (InstEltSize > NewResEltSize) + // Extend the output to match the original instruction type. + NewIn = IRB.CreateSExt(NewIn, InstTy); + + // Interleave elements in the output vector. 
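+  // The mask [0, HalfElts, 1, HalfElts + 1, ...] pairs each element of the
+  // low half with the corresponding element of the high half.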
+ SmallVector ShuffleMask; + unsigned HalfElts = NumElts / 2; + for (unsigned i = 0; i < HalfElts; ++i) { + ShuffleMask.push_back(IRB.getInt32(i)); + ShuffleMask.push_back(IRB.getInt32(HalfElts + i)); + } + NewIn = IRB.CreateShuffleVector(NewIn, PoisonValue::get(ResType), + ConstantVector::get(ShuffleMask)); + + Inst->replaceAllUsesWith(NewIn); + return true; +} + +bool HexagonGenWideningVecInstr::genSaturatingInst(Instruction *Inst) { + Type *InstTy = Inst->getType(); + assert(InstTy->isVectorTy()); + if (InstTy->getPrimitiveSizeInBits() > HwVLen) + return false; + + using namespace PatternMatch; + CmpPredicate P1, P2; + Value *L1 = nullptr, *T1 = nullptr, *L2 = nullptr, *T2 = nullptr, + *L3 = nullptr; + Constant *RC1 = nullptr, *FC1 = nullptr, *RC2 = nullptr, *FC2 = nullptr, + *RC3 = nullptr; + + // Pattern of interest: ashr -> llvm.smin -> llvm.smax -> trunc + // Match trunc instruction + if (match(Inst, m_Trunc(m_Intrinsic(m_Value(L1), + m_Constant(RC1))))) { + // Match llvm.smin instruction + if (match(L1, m_Intrinsic(m_Value(L2), m_Constant(RC2)))) { + // Match ashr instruction + if (match(L2, m_AShr(m_Value(L3), m_Constant(RC3)))) { + std::pair MinMax; + // get min, max values from operatands of smin and smax + if (getMinMax(RC1, RC2, MinMax)) { + bool IsResSigned; + // Validate the saturating vasr pattern + if (isSaturatingVAsr(Inst, L2, MinMax.first, MinMax.second, + IsResSigned)) { + // Get the shift value from the ashr operand + ConstantInt *shift_val = + dyn_cast(RC3->getSplatValue()); + if (shift_val) { + Value *NewIn = + createVAsrIntrinsic(Inst, L3, shift_val, IsResSigned); + Inst->replaceAllUsesWith(NewIn); + return true; + } + } + } + } + } + } + + if (!match(Inst, (m_Trunc(m_Select(m_ICmp(P1, m_Value(L1), m_Constant(RC1)), + m_Value(T1), m_Constant(FC1))))) || + (T1 != L1 || FC1 != RC1)) + return false; + + if (!match(L1, m_Select(m_ICmp(P2, m_Value(L2), m_Constant(RC2)), m_Value(T2), + m_Constant(FC2))) || + (T2 != L2 || FC2 != RC2)) + return false; + + if (!((P1 == CmpInst::ICMP_SGT && P2 == CmpInst::ICMP_SLT) || + (P1 == CmpInst::ICMP_SLT && P2 == CmpInst::ICMP_SGT))) + return false; + + std::pair MinMax; + if ((P1 == CmpInst::ICMP_SGT) && (P2 == CmpInst::ICMP_SLT)) { + if (!getMinMax(RC1, RC2, MinMax)) + return false; + } else if (!getMinMax(RC2, RC1, MinMax)) + return false; + + Value *S = L2; // Value being saturated + + // Only AShr instructions are handled. + // Also, second operand to AShr must be a scalar. 
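+  // i.e. the shift amount must be a scalar splat built from an insertelement
+  // followed by a zero-mask shufflevector.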
+ Value *OP1 = nullptr, *ShiftByVal = nullptr; + if (!match(S, m_AShr(m_Value(OP1), + m_Shuffle(m_InsertElt(m_Poison(), m_Value(ShiftByVal), + m_Zero()), + m_Poison(), m_ZeroMask())))) + return false; + + bool IsResSigned; + if (!isSaturatingVAsr(Inst, S, MinMax.first, MinMax.second, IsResSigned)) + return false; + + Value *NewIn = createVAsrIntrinsic(Inst, OP1, ShiftByVal, IsResSigned); + Inst->replaceAllUsesWith(NewIn); + return true; +} + +Value *HexagonGenWideningVecInstr::extendShiftByVal(Value *ShiftByVal, + IRBuilder<> &IRB) { + using namespace PatternMatch; + Value *A = nullptr; + if (match(ShiftByVal, m_Trunc(m_Value(A)))) + return A; + return IRB.CreateZExt(ShiftByVal, IRB.getInt32Ty()); +} + +bool HexagonGenWideningVecInstr::getMinMax(Constant *MinC, Constant *MaxC, + std::pair &MinMax) { + Value *SplatV; + if (!(SplatV = MinC->getSplatValue()) || !(dyn_cast(SplatV))) + return false; + if (!(SplatV = MaxC->getSplatValue()) || !(dyn_cast(SplatV))) + return false; + + ConstantInt *MinI = dyn_cast(MinC->getSplatValue()); + ConstantInt *MaxI = dyn_cast(MaxC->getSplatValue()); + MinMax = std::pair(MinI->getSExtValue(), MaxI->getSExtValue()); + return true; +} + +bool HexagonGenWideningVecInstr::isSaturatingVAsr(Instruction *Inst, Value *S, + int MinV, int MaxV, + bool &IsResSigned) { + if (MinV >= MaxV) + return false; + + IsResSigned = true; + Type *InstTy = Inst->getType(); + Type *EltTy = cast(InstTy)->getElementType(); + unsigned TruncSize = EltTy->getPrimitiveSizeInBits(); + + int MaxRange, MinRange; + if (MinV < 0) { // Saturate to a signed value + MaxRange = (1 << (TruncSize - 1)) - 1; + MinRange = -(1 << (TruncSize - 1)); + } else if (MinV == 0) { // Saturate to an unsigned value + MaxRange = (1 << (TruncSize)) - 1; + MinRange = 0; + IsResSigned = false; + } else + return false; + + if (MinV != MinRange || MaxV != MaxRange) + return false; + + auto *SInst = dyn_cast(S); + if (SInst->getOpcode() == Instruction::AShr) { + Type *SInstTy = SInst->getType(); + Type *SEltTy = cast(SInstTy)->getElementType(); + unsigned SInstEltSize = SEltTy->getPrimitiveSizeInBits(); + if (SInstEltSize != 2 * TruncSize || TruncSize > 16) + return false; + } + return true; +} + +Intrinsic::ID HexagonGenWideningVecInstr::getVAsrIntrinsic(bool IsInSigned, + bool IsResSigned) { + if (!IsResSigned) + return (IsInSigned) ? Intrinsic::hexagon_vasrsat_su + : Intrinsic::hexagon_vasrsat_uu; + return Intrinsic::hexagon_vasrsat_ss; +} + +Value *HexagonGenWideningVecInstr::createVAsrIntrinsic(Instruction *Inst, + Value *VecOP, + Value *ShiftByVal, + bool IsResSigned) { + IRBuilder<> IRB(Inst); + Type *ShiftByTy = ShiftByVal->getType(); + if (ShiftByTy->getPrimitiveSizeInBits() < 32) + ShiftByVal = extendShiftByVal(ShiftByVal, IRB); + + Type *InstTy = Inst->getType(); + Type *EltTy = cast(InstTy)->getElementType(); + unsigned NumElts = cast(InstTy)->getNumElements(); + unsigned InstEltSize = EltTy->getPrimitiveSizeInBits(); + + // Replace the instruction with saturating vasr intrinsic. + // Since vasr with saturation interleaves elements from both input vectors, + // they must be deinterleaved for output to end up in the right order. 
+ SmallVector ShuffleMask; + unsigned HalfElts = NumElts / 2; + // Even elements + for (unsigned i = 0; i < HalfElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i * 2)); + // Odd elements + for (unsigned i = 0; i < HalfElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i * 2 + 1)); + + VecOP = IRB.CreateShuffleVector(VecOP, PoisonValue::get(VecOP->getType()), + ConstantVector::get(ShuffleMask)); + + auto *InVecOPTy = + FixedVectorType::get(getElementTy(InstEltSize * 2, IRB), HalfElts); + std::pair HiLo = opSplit(VecOP, Inst, InVecOPTy); + Intrinsic::ID IntID = getVAsrIntrinsic(true, IsResSigned); + Function *F = Intrinsic::getOrInsertDeclaration(M, IntID, InVecOPTy); + Value *NewIn = IRB.CreateCall(F, {HiLo.first, HiLo.second, ShiftByVal}); + return IRB.CreateBitCast(NewIn, InstTy); +} + +// Generate vavg instruction. +bool HexagonGenWideningVecInstr::genVAvg(Instruction *Inst) { + using namespace PatternMatch; + Type *InstTy = Inst->getType(); + assert(InstTy->isVectorTy()); + + bool Match = false; + Value *OP1 = nullptr, *OP2 = nullptr; + bool IsSigned; + if ((Match = (match(Inst, m_Trunc(m_LShr(m_Add(m_ZExt(m_Value(OP1)), + m_ZExt(m_Value(OP2))), + m_SpecificInt(1))))))) + IsSigned = false; + if (!Match && + (Match = (match(Inst, m_Trunc(m_LShr(m_Add(m_SExt(m_Value(OP1)), + m_SExt(m_Value(OP2))), + m_SpecificInt(1))))) || + match(Inst, m_LShr(m_Add(m_Value(OP1), m_Value(OP2)), + m_SpecificInt(1))))) + IsSigned = true; + + if (!Match) + return false; + + unsigned OP1EltSize = getElementSizeInBits(OP1); + unsigned OP2EltSize = getElementSizeInBits(OP2); + unsigned NewEltSize = std::max(OP1EltSize, OP2EltSize); + + Type *EltTy = cast(InstTy)->getElementType(); + unsigned InstEltSize = EltTy->getPrimitiveSizeInBits(); + unsigned InstLen = InstTy->getPrimitiveSizeInBits(); + + // Only vectors that are either smaller, same or twice of the hardware + // vector length are allowed. + if (InstEltSize < NewEltSize || (InstLen > 2 * HwVLen)) + return false; + + if ((InstLen > HwVLen) && (InstLen % HwVLen != 0)) + return false; + + IRBuilder<> IRB(Inst); + unsigned NumElts = cast(InstTy)->getNumElements(); + auto *AvgInstTy = + FixedVectorType::get(getElementTy(NewEltSize, IRB), NumElts); + if (OP1EltSize < NewEltSize) + OP1 = (IsSigned) ? IRB.CreateSExt(OP1, AvgInstTy) + : IRB.CreateZExt(OP1, AvgInstTy); + if (OP2EltSize < NewEltSize) + OP2 = (IsSigned) ? IRB.CreateSExt(OP2, AvgInstTy) + : IRB.CreateZExt(OP2, AvgInstTy); + + Intrinsic::ID AvgIntID = + (IsSigned) ? Intrinsic::hexagon_vavgs : Intrinsic::hexagon_vavgu; + Value *NewIn = nullptr; + + // Split operands if they need more than a vector length. + if (NewEltSize * NumElts > HwVLen) { + unsigned HalfElts = NumElts / 2; + auto *ResType = + FixedVectorType::get(getElementTy(NewEltSize, IRB), HalfElts); + std::pair SplitOP1 = opSplit(OP1, Inst, ResType); + std::pair SplitOP2 = opSplit(OP2, Inst, ResType); + Value *NewHi = createIntrinsic(AvgIntID, Inst, SplitOP1.first, + SplitOP2.first, ResType, NumElts, false); + Value *NewLo = createIntrinsic(AvgIntID, Inst, SplitOP1.second, + SplitOP2.second, ResType, NumElts, false); + SmallVector ShuffleMask; + for (unsigned i = 0; i < NumElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i)); + // Concat Hi and Lo. + NewIn = + IRB.CreateShuffleVector(NewLo, NewHi, ConstantVector::get(ShuffleMask)); + } else + NewIn = + createIntrinsic(AvgIntID, Inst, OP1, OP2, AvgInstTy, NumElts, false); + + if (InstEltSize > NewEltSize) + // Extend the output to match the original instruction type. + NewIn = (IsSigned) ? 
IRB.CreateSExt(NewIn, InstTy) + : IRB.CreateZExt(NewIn, InstTy); + Inst->replaceAllUsesWith(NewIn); + return true; +} + +bool HexagonGenWideningVecInstr::visitBlock(BasicBlock *B) { + bool Changed = false; + for (auto &I : *B) { + Type *InstTy = I.getType(); + if (!InstTy->isVectorTy() || !HST->isTypeForHVX(cast(InstTy))) + continue; + + unsigned InstLen = InstTy->getPrimitiveSizeInBits(); + if (InstLen < HwVLen && !WidenShortVector) + continue; + + Changed |= processInstructionForVMPA(&I); + Changed |= genSaturatingInst(&I); + Changed |= genVAvg(&I); + } + // Generate widening instructions. + for (auto &I : *B) + Changed |= processInstruction(&I); + return Changed; +} + +bool HexagonGenWideningVecInstr::runOnFunction(Function &F) { + M = F.getParent(); + HST = TM->getSubtargetImpl(F); + + // Return if useHVX128BOps is not set. It can be enabled for 64B mode + // but wil require some changes. For example, bitcast for intrinsics + // assumes 128B mode. + if (skipFunction(F) || !HST->useHVX128BOps()) + return false; + + HwVLen = HST->getVectorLength() * 8; // Vector Length in bits + bool Changed = false; + for (auto &B : F) + Changed |= visitBlock(&B); + + return Changed; +} + +FunctionPass * +llvm::createHexagonGenWideningVecInstr(const HexagonTargetMachine &TM) { + return new HexagonGenWideningVecInstr(&TM); +} diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index f4d2a79051c10..5b8ba99e978c9 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -592,6 +592,7 @@ class HexagonTargetLowering : public TargetLowering { SDValue WidenHvxLoad(SDValue Op, SelectionDAG &DAG) const; SDValue WidenHvxStore(SDValue Op, SelectionDAG &DAG) const; SDValue WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const; + SDValue WidenHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const; SDValue LegalizeHvxResize(SDValue Op, SelectionDAG &DAG) const; SDValue ExpandHvxResizeIntoSteps(SDValue Op, SelectionDAG &DAG) const; SDValue EqualizeFpIntConversion(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 212a57bc7cde5..981521c9b1f9b 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -466,6 +466,7 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::ANY_EXTEND, VecTy, Custom); setOperationAction(ISD::SIGN_EXTEND, VecTy, Custom); setOperationAction(ISD::ZERO_EXTEND, VecTy, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, VecTy, Custom); if (Subtarget.useHVXFloatingPoint()) { setOperationAction(ISD::FP_TO_SINT, VecTy, Custom); setOperationAction(ISD::FP_TO_UINT, VecTy, Custom); @@ -3403,6 +3404,104 @@ HexagonTargetLowering::WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const { {SetCC, getZero(dl, MVT::i32, DAG)}); } +SDValue HexagonTargetLowering::WidenHvxIntrinsic(SDValue Op, + SelectionDAG &DAG) const { + const SDLoc &dl(Op); + unsigned HwWidth = 8 * Subtarget.getVectorLength(); + bool IsResInterleaved = false; + + SDValue WideRes = SDValue(); + SDValue Op1 = Op.getOperand(1); + MVT ResTy = ty(Op); + MVT OpTy = ty(Op1); + if (!Subtarget.isHVXElementType(OpTy) || !Subtarget.isHVXElementType(ResTy)) + return SDValue(); + + auto getFactor = [HwWidth](MVT Ty) { + unsigned Width = Ty.getSizeInBits(); + assert(HwWidth % Width == 0); + return HwWidth / Width; + }; + + auto getWideTy = [getFactor](MVT Ty) { + 
unsigned WideLen = Ty.getVectorNumElements() * getFactor(Ty); + return MVT::getVectorVT(Ty.getVectorElementType(), WideLen); + }; + + unsigned IID = cast(Op.getOperand(0))->getZExtValue(); + SDValue Op2 = Op.getOperand(2); + SDValue WideOp1 = appendUndef(Op1, getWideTy(OpTy), DAG); + SDValue WideOp2; + if (dyn_cast(Op2.getNode())) { + WideOp2 = Op2; + } else { + WideOp2 = appendUndef(Op2, getWideTy(OpTy), DAG); + } + unsigned WidenFactor = getFactor(OpTy); + unsigned WideLen = ResTy.getVectorNumElements() * WidenFactor; + MVT WideResTy = MVT::getVectorVT(ResTy.getVectorElementType(), WideLen); + + switch (IID) { + default: + return SDValue(); + case Intrinsic::hexagon_vasrsat_su: + case Intrinsic::hexagon_vasrsat_uu: + case Intrinsic::hexagon_vasrsat_ss: + WideRes = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WideResTy, + DAG.getConstant(IID, dl, MVT::i32), WideOp1, WideOp2, + Op.getOperand(3)); + break; + case Intrinsic::hexagon_vadd_su: + case Intrinsic::hexagon_vadd_uu: + case Intrinsic::hexagon_vadd_ss: + case Intrinsic::hexagon_vadd_us: + + case Intrinsic::hexagon_vsub_su: + case Intrinsic::hexagon_vsub_uu: + case Intrinsic::hexagon_vsub_ss: + case Intrinsic::hexagon_vsub_us: + + case Intrinsic::hexagon_vmpy_su: + case Intrinsic::hexagon_vmpy_uu: + case Intrinsic::hexagon_vmpy_ss: + case Intrinsic::hexagon_vmpy_us: + case Intrinsic::hexagon_vmpy_ub_ub: + case Intrinsic::hexagon_vmpy_ub_b: + case Intrinsic::hexagon_vmpy_uh_uh: + case Intrinsic::hexagon_vmpy_h_h: + IsResInterleaved = true; + WideRes = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WideResTy, + DAG.getConstant(IID, dl, MVT::i32), WideOp1, WideOp2); + break; + case Intrinsic::hexagon_vavgu: + case Intrinsic::hexagon_vavgs: + WideRes = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WideResTy, + DAG.getConstant(IID, dl, MVT::i32), WideOp1, WideOp2); + break; + } + unsigned OrigLen = ResTy.getVectorNumElements(); + assert(OrigLen % 2 == 0); + unsigned HalfOrigLen = OrigLen / 2; + unsigned SplitLen = WideLen / 2; + if (IsResInterleaved) { + // Get the valid odd and even elements from the widened vector-pair while + // maintaining their deinterleaved order. The following shuffle_vector will + // produce a vector-pair with all the valid elements (even followed by odd) + // accumulated together followed by undefs. 
+ SmallVector ShuffV; + for (unsigned j = 0; j < WidenFactor; j++) { + for (unsigned i = 0; i < HalfOrigLen; i++) + ShuffV.push_back(j * HalfOrigLen + i); + for (unsigned i = 0; i < HalfOrigLen; i++) + ShuffV.push_back(SplitLen + j * HalfOrigLen + i); + } + WideRes = DAG.getVectorShuffle(WideResTy, dl, WideRes, + DAG.getUNDEF(WideResTy), ShuffV); + } + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResTy, + {WideRes, getZero(dl, MVT::i32, DAG)}); +} + SDValue HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); @@ -3669,6 +3768,12 @@ HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N, Results.push_back(S); } break; + case ISD::INTRINSIC_WO_CHAIN: + if (shouldWidenToHvx(ty(Op.getOperand(1)), DAG)) { + if (SDValue T = WidenHvxIntrinsic(Op, DAG)) + Results.push_back(T); + } + break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::FP_TO_SINT: @@ -3729,6 +3834,11 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N, Results.push_back(C); } break; + case ISD::INTRINSIC_WO_CHAIN: + assert(shouldWidenToHvx(ty(N->getOperand(1)), DAG) && "Not widening?"); + if (SDValue T = WidenHvxIntrinsic(Op, DAG)) + Results.push_back(T); + break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: if (ty(Op).getSizeInBits() != ty(Inp0).getSizeInBits()) { diff --git a/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/llvm/lib/Target/Hexagon/HexagonIntrinsics.td index 25b81d8cd21ff..7f16c3e231d09 100644 --- a/llvm/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/llvm/lib/Target/Hexagon/HexagonIntrinsics.td @@ -355,6 +355,120 @@ defm : T_VVI_inv_pat ; defm : T_VVI_inv_pat ; defm : T_VVR_pat ; + +class VAccGenIntrin_pat + : Pat<(add WPred:$Vx, (ResType (IntID VPred:$Vs, VPred:$Vt))), + (MI WPred:$Vx, VPred:$Vs, VPred:$Vt)>, Requires<[UseHVX128B]>; + +let AddedComplexity = 100 in { + def : VAccGenIntrin_pat; + def : VAccGenIntrin_pat; + def : VAccGenIntrin_pat; + def : VAccGenIntrin_pat; + + // The second operand in V6_vmpybusv_acc is unsigned. + def : Pat<(add HWI16:$Vx, (VecPI16 (int_hexagon_vmpy_us HVI8:$Vs, + HVI8:$Vv))), + (V6_vmpybusv_acc HWI16:$Vx, HVI8:$Vs, HVI8:$Vv)>; + + def : Pat<(add HWI16:$Vx, (VecPI16 (int_hexagon_vmpy_su HVI8:$Vs, + HVI8:$Vv))), + (V6_vmpybusv_acc HWI16:$Vx, HVI8:$Vv, HVI8:$Vs)>; + + // The third operand in V6_vmpyhus_acc is unsigned. + def : Pat<(add HWI32:$Vx, (VecPI32 (int_hexagon_vmpy_us HVI16:$Vs, + HVI16:$Vv))), + (V6_vmpyhus_acc HWI32:$Vx, HVI16:$Vv, HVI16:$Vs)>; + + def : Pat<(add HWI32:$Vx, (VecPI32 (int_hexagon_vmpy_su HVI16:$Vs, + HVI16:$Vv))), + (V6_vmpyhus_acc HWI32:$Vx, HVI16:$Vs, HVI16:$Vv)>; +} + +class ExtIntrin_pat + : Pat<(ResType (IntID VPred:$Vs, VPred:$Vt)), + (MI VPred:$Vs, VPred:$Vt)>, Requires<[UseHVX128B]>; + +def : ExtIntrin_pat; +def : ExtIntrin_pat; +def : ExtIntrin_pat; + +def : ExtIntrin_pat; +def : ExtIntrin_pat; +def : ExtIntrin_pat; + +def : ExtIntrin_pat; +def : ExtIntrin_pat; +def : ExtIntrin_pat; +def : ExtIntrin_pat; + +// The first operand in V6_vmpybusv is unsigned. +def : Pat<(VecPI16 (int_hexagon_vmpy_us HVI8:$Vs, HVI8:$Vv)), + (V6_vmpybusv HVI8:$Vs, HVI8:$Vv)>; + +def : Pat<(VecPI16 (int_hexagon_vmpy_su HVI8:$Vs, HVI8:$Vv)), + (V6_vmpybusv HVI8:$Vv, HVI8:$Vs)>; + +// The second operand in V6_vmpyhus is unsigned. 
+def : Pat<(VecPI32 (int_hexagon_vmpy_us HVI16:$Vs, HVI16:$Vv)), + (V6_vmpyhus HVI16:$Vv, HVI16:$Vs)>; + +def : Pat<(VecPI32 (int_hexagon_vmpy_su HVI16:$Vs, HVI16:$Vv)), + (V6_vmpyhus HVI16:$Vs, HVI16:$Vv)>; + +class VAvgInstr_pat + : Pat<(ResType (IntID VPred:$Vs, VPred:$Vt)), + (MI VPred:$Vs, VPred:$Vt)>, Requires<[UseHVX128B]>; + +def : VAvgInstr_pat; +def : VAvgInstr_pat; +def : VAvgInstr_pat; +def : VAvgInstr_pat; +def : VAvgInstr_pat; +def : VAvgInstr_pat; + +class VAsrIntr_pat +: Pat<(ResType (IntID VPred:$Vs, VPred:$Vt, IntRegsLow8:$Rt)), + (MI VPred:$Vs, VPred:$Vt, IntRegsLow8:$Rt)>, Requires<[UseHVX128B]>; + +def : VAsrIntr_pat; +def : VAsrIntr_pat; +def : VAsrIntr_pat; +def : VAsrIntr_pat; +def : VAsrIntr_pat; +def : VAsrIntr_pat; + +class VMpyVSInstr_pat +: Pat<(ResType (IntID VPred:$Vs, IntRegs:$Rt)), + (MI VPred:$Vs, IntRegs:$Rt)>, Requires<[UseHVX128B]>; + +def : VMpyVSInstr_pat; +def : VMpyVSInstr_pat; +def : VMpyVSInstr_pat; +def : VMpyVSInstr_pat; + +class VAccIntrin_pat + : Pat<(add HvxWR:$Vx, (IntID HvxVR:$Vs, HvxVR:$Vt)), + (MI HvxWR:$Vx, HvxVR:$Vs, HvxVR:$Vt)>, Requires<[UseHVX128B]>; + +let AddedComplexity = 350 in { + def : VAccIntrin_pat; + def : VAccIntrin_pat; + def : VAccIntrin_pat; + def : VAccIntrin_pat; + def : VAccIntrin_pat; + def : VAccIntrin_pat; +} + def: Pat<(int_hexagon_V6_vd0), (V6_vd0)>, Requires<[UseHVXV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vd0_128B ), diff --git a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp index 5a1d5bc669169..c68b63205fbbf 100644 --- a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -138,7 +138,7 @@ static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII, return false; // Make sure that the (unique) def operand is a register from IntRegs. - bool HadDef = false; + [[maybe_unused]] bool HadDef = false; for (const MachineOperand &Op : II->operands()) { if (!Op.isReg() || !Op.isDef()) continue; diff --git a/llvm/lib/Target/Hexagon/HexagonOptShuffleVector.cpp b/llvm/lib/Target/Hexagon/HexagonOptShuffleVector.cpp new file mode 100644 index 0000000000000..fcfae1776ecec --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonOptShuffleVector.cpp @@ -0,0 +1,713 @@ +//===---------------------- HexagonOptShuffleVector.cpp -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Optimize vector shuffles by postponing them as late as possible. The intent +// here is to remove uncessary shuffles and also increases the oportunities for +// adjacent shuffles to be merged together. 
+// +//===----------------------------------------------------------------------===// + +#include "HexagonTargetMachine.h" +#include "llvm/ADT/APInt.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsHexagon.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "hex-shuff-vec" +/// A command line argument to limit the search space along def chain. +static cl::opt MaxDefSearchCount( + "shuffvec-max-search-count", + cl::desc("Maximum number of instructions traversed along def chain."), + cl::Hidden, cl::init(15)); + +#ifndef NDEBUG +static cl::opt + ShuffVecLimit("shuff-vec-max", + cl::desc("Maximum number of shuffles to be relocated."), + cl::Hidden, cl::init(-1)); +#endif + +namespace llvm { +void initializeHexagonOptShuffleVectorPass(PassRegistry &); +FunctionPass *createHexagonOptShuffleVector(const HexagonTargetMachine &); +} // end namespace llvm + +namespace { + +class HexagonOptShuffleVector : public FunctionPass { +public: + static char ID; +#ifndef NDEBUG + static int NumRelocated; +#endif + HexagonOptShuffleVector() : FunctionPass(ID) { + initializeHexagonOptShuffleVectorPass(*PassRegistry::getPassRegistry()); + } + + HexagonOptShuffleVector(const HexagonTargetMachine *TM) + : FunctionPass(ID), TM(TM) { + initializeHexagonOptShuffleVectorPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Hexagon Optimize Vector Shuffles"; + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + } + +private: + using ValueVector = SmallVector; + const HexagonTargetMachine *TM = nullptr; + const HexagonSubtarget *HST = nullptr; + SmallPtrSet Visited; + using ShuffUseList = + SmallDenseMap>; + ShuffUseList ShuffUses; + int DefSearchCount; + + bool visitBlock(BasicBlock *B); + bool findNewShuffLoc(Instruction *I, ArrayRef &ShuffMask, + Value *&NewLoc); + bool isValidIntrinsic(IntrinsicInst *I); + bool relocateShuffVec(Instruction *I, ArrayRef &M, Value *NewLoc, + std::list &WorkList); + bool getUseList(Instruction *I, ValueVector &UseList); + bool analyzeHiLoUse(Instruction *HI, Instruction *LO, + ArrayRef &ShuffMask, Value *&NewLoc, + ShuffUseList &CurShuffUses); + bool isHILo(Value *V, bool IsHI); + bool hasDefWithSameShuffMask(Value *V, SmallVector &ImmUse, + ArrayRef &ShuffMask, + ShuffUseList &CurShuffUses); + void FindHiLoUse(ValueVector &UseList, Instruction *&HI, Instruction *&LO); + bool isConcatMask(ArrayRef &Mask, Instruction *ShuffInst); + bool isValidUseInstr(ValueVector &UseList, Instruction *&UI); + bool areAllOperandsValid(Instruction *I, Instruction *UI, + ArrayRef &ShuffMask, + ShuffUseList &CurShuffUses); + Value *getOperand(Instruction *I, unsigned i); + static iterator_range getArgOperands(User *U); + static std::pair stripCasts(Value *V); + static bool isConstantVectorSplat(Value *V); +}; + +} // end anonymous namespace + +#ifndef NDEBUG +int HexagonOptShuffleVector::NumRelocated = 0; +#endif +char HexagonOptShuffleVector::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonOptShuffleVector, "shuff-vec", + "Hexagon Optimize Shuffle Vector", false, false) 
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(HexagonOptShuffleVector, "shuff-vec", + "Hexagon Optimize Shuffle Vector", false, false) + +bool HexagonOptShuffleVector::isConcatMask(ArrayRef &Mask, + Instruction *ShuffInst) { + Type *ShuffTy = ShuffInst->getType(); + int NumElts = cast(ShuffTy)->getNumElements(); + for (int i = 0; i < NumElts; i++) { + if (Mask[i] != i) + return false; + } + return true; +} + +bool HexagonOptShuffleVector::isValidIntrinsic(IntrinsicInst *I) { + switch (I->getIntrinsicID()) { + default: + return false; + case Intrinsic::hexagon_V6_vaddubh_128B: + case Intrinsic::hexagon_V6_vadduhw_128B: + case Intrinsic::hexagon_V6_vaddhw_128B: + case Intrinsic::hexagon_V6_vaddh_dv_128B: + case Intrinsic::hexagon_V6_vsububh_128B: + case Intrinsic::hexagon_V6_vsubuhw_128B: + case Intrinsic::hexagon_V6_vsubhw_128B: + case Intrinsic::hexagon_V6_vsubh_dv_128B: + case Intrinsic::hexagon_V6_vmpyubv_128B: + case Intrinsic::hexagon_V6_vmpybv_128B: + case Intrinsic::hexagon_V6_vmpyuhv_128B: + case Intrinsic::hexagon_V6_vmpyhv_128B: + case Intrinsic::hexagon_V6_vmpybusv_128B: + case Intrinsic::hexagon_V6_vmpyhus_128B: + case Intrinsic::hexagon_V6_vavgb_128B: + case Intrinsic::hexagon_V6_vavgub_128B: + case Intrinsic::hexagon_V6_vavgh_128B: + case Intrinsic::hexagon_V6_vavguh_128B: + case Intrinsic::hexagon_V6_vavgw_128B: + case Intrinsic::hexagon_V6_vavguw_128B: + case Intrinsic::hexagon_V6_hi_128B: + case Intrinsic::hexagon_V6_lo_128B: + case Intrinsic::sadd_sat: + case Intrinsic::uadd_sat: + // Generic hexagon vector intrinsics + case Intrinsic::hexagon_vadd_su: + case Intrinsic::hexagon_vadd_uu: + case Intrinsic::hexagon_vadd_ss: + case Intrinsic::hexagon_vadd_us: + case Intrinsic::hexagon_vsub_su: + case Intrinsic::hexagon_vsub_uu: + case Intrinsic::hexagon_vsub_ss: + case Intrinsic::hexagon_vsub_us: + case Intrinsic::hexagon_vmpy_su: + case Intrinsic::hexagon_vmpy_uu: + case Intrinsic::hexagon_vmpy_ss: + case Intrinsic::hexagon_vmpy_us: + case Intrinsic::hexagon_vavgu: + case Intrinsic::hexagon_vavgs: + case Intrinsic::hexagon_vmpy_ub_b: + case Intrinsic::hexagon_vmpy_ub_ub: + case Intrinsic::hexagon_vmpy_uh_uh: + case Intrinsic::hexagon_vmpy_h_h: + return true; + } + llvm_unreachable("Unsupported instruction!"); +} + +bool HexagonOptShuffleVector::getUseList(Instruction *I, ValueVector &UseList) { + for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) { + Instruction *J = dyn_cast(*UI); + if (!J) + return false; + if (auto *C = dyn_cast(*UI)) { + if (!getUseList(C, UseList)) + return false; + } else + UseList.push_back(*UI); + ++UI; + } + return true; +} + +bool HexagonOptShuffleVector::isHILo(Value *V, bool IsHI) { + if (!(dyn_cast(V))) + return false; + Instruction *I = dyn_cast(V); + if (!isa(I)) + return false; + IntrinsicInst *II = dyn_cast(I); + if (!II) + return false; + if ((II->getIntrinsicID() == Intrinsic::hexagon_V6_hi_128B && IsHI) || + (II->getIntrinsicID() == Intrinsic::hexagon_V6_lo_128B && !IsHI)) + return true; + return false; +} + +Value *HexagonOptShuffleVector::getOperand(Instruction *I, unsigned i) { + Value *V = I->getOperand(i); + if (auto *C = dyn_cast(V)) + return C->getOperand(0); + return V; +} + +iterator_range +HexagonOptShuffleVector::getArgOperands(User *U) { + if (auto *CB = dyn_cast(U)) + return CB->args(); + return U->operands(); +} + +// Strip out all the cast operations to find the first non-cast definition of a +// value. The function also returns the last cast operation in the def-chain. 
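+// For example, stripCasts(bitcast(zext(%x))) returns {%x, zext(%x)}.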
+std::pair HexagonOptShuffleVector::stripCasts(Value *V) { + Value *LastCast = nullptr; + while (auto *C = dyn_cast(V)) { + LastCast = V; + V = C->getOperand(0); + } + return std::make_pair(V, LastCast); +} + +bool HexagonOptShuffleVector::isConstantVectorSplat(Value *V) { + if (auto *CV = dyn_cast(V)) + return CV->getSplatValue(); + if (auto *CV = dyn_cast(V)) + return CV->isSplat(); + return false; +} + +// Make sure all the operations on HI and LO counterparts are identical +// until both halves are merged together. When a merge point (concat) +// is found, set it as 'NewLoc' and return. +bool HexagonOptShuffleVector::analyzeHiLoUse(Instruction *HI, Instruction *LO, + ArrayRef &ShuffMask, + Value *&NewLoc, + ShuffUseList &CurShuffUses) { + ValueVector HiUseList, LoUseList; + getUseList(HI, HiUseList); + getUseList(LO, LoUseList); + + // To keep the analsis simple, only handle Hi and Lo with a single use. Also, + // not even sure at this point if it will be profitable due to multiple + // merge points. + if (HiUseList.size() != 1 || LoUseList.size() != 1) + return false; + + Instruction *HiUse = dyn_cast(HiUseList[0]); + Instruction *LoUse = dyn_cast(LoUseList[0]); + if (!HiUse || !LoUse) + return false; + + bool IsUseIntrinsic = false; + if (isa(HiUse)) { + if (!isa(LoUse)) + return false; + // Continue only if both Hi and Lo uses are calls to the same intrinsic. + IntrinsicInst *HiUseII = dyn_cast(HiUse); + IntrinsicInst *LoUseII = dyn_cast(LoUse); + if (!HiUseII || !LoUseII || + HiUseII->getIntrinsicID() != LoUseII->getIntrinsicID() || + !isValidIntrinsic(HiUseII)) + return false; + IsUseIntrinsic = true; + HiUse = HiUseII; + LoUse = LoUseII; + } + if (HiUse->getOpcode() != LoUse->getOpcode()) + return false; + + // If both Hi and Lo use are same and is a concat operation, set it + // as a 'NewLoc'. + if (HiUse == LoUse) { + // Return true if use is a concat of Hi and Lo. + ArrayRef M; + if (match(HiUse, (m_Shuffle(m_Value(), m_Value(), m_Mask(M))))) { + if (isConcatMask(M, HiUse)) { + NewLoc = HiUse; + return true; + } + } + return false; + } + + // Check if HiUse and LoUse are shuffles with the same mask. If so, safe to + // continue the search. + ArrayRef M1, M2; + if (match(HiUse, (m_Shuffle(m_Value(), m_Poison(), m_Mask(M1)))) && + match(LoUse, (m_Shuffle(m_Value(), m_Poison(), m_Mask(M2)))) && + M1.equals(M2)) + return analyzeHiLoUse(HiUse, LoUse, ShuffMask, NewLoc, CurShuffUses); + + // For now, only handling binary ops and some of the instrinsics + // which appear to be safe (hardcoded in isValidIntrinsic()). + if (!HiUse->isBinaryOp() && !IsUseIntrinsic) + return false; + + ValueVector HiUseOperands, LoUseOperands; + int HiOpNum = -1, LoOpNum = -1; + for (unsigned i = 0; i < HiUse->getNumOperands(); i++) { + Value *V = getOperand(HiUse, i); + if (V == HI) + HiOpNum = i; + else + HiUseOperands.push_back(V); + } + for (unsigned i = 0; i < LoUse->getNumOperands(); i++) { + Value *V = getOperand(LoUse, i); + if (V == LO) + LoOpNum = i; + else + LoUseOperands.push_back(V); + } + + // Enforcing strict ordering which is not necessary in case of + // commutative operations and may be relaxed in future if needed. + if (HiOpNum < 0 || HiOpNum != LoOpNum || + LoUseOperands.size() != HiUseOperands.size()) + return false; + + unsigned NumOperands = HiUseOperands.size(); + for (unsigned i = 0; i < NumOperands; i++) { + if (HiUseOperands[i] == LoUseOperands[i]) + continue; + // Only handle the case where other operands to Hi and Lo uses + // are comming from another Hi and Lo pair. 
+ if (!isHILo(HiUseOperands[i], true) || !isHILo(LoUseOperands[i], false)) + return false; + + Value *DefHiUse = dyn_cast(HiUseOperands[i])->getOperand(0); + Value *DefLoUse = dyn_cast(LoUseOperands[i])->getOperand(0); + if (!DefHiUse || DefHiUse != DefLoUse) + return false; + SmallVector ImmUseList; + if (dyn_cast(DefHiUse)) + ImmUseList.push_back(dyn_cast(DefHiUse)); + else { + ImmUseList.push_back(HiUse); + ImmUseList.push_back(LoUse); + } + + // Make sure that the Hi/Lo def has the same shuffle mask. + if (!hasDefWithSameShuffMask(DefHiUse, ImmUseList, ShuffMask, CurShuffUses)) + return false; + } + + // Continue the search along Hi/Lo use-chain. + return analyzeHiLoUse(HiUse, LoUse, ShuffMask, NewLoc, CurShuffUses); +} + +bool HexagonOptShuffleVector::hasDefWithSameShuffMask( + Value *V, SmallVector &ImmUses, ArrayRef &ShuffMask, + ShuffUseList &CurShuffUses) { + // Follow def-chain until we have found a shuffle_vector or have run out + // of max number of attempts. + if (DefSearchCount >= MaxDefSearchCount) + return false; + + ++DefSearchCount; + V = stripCasts(V).first; + Instruction *I = dyn_cast(V); + if (!I) + return false; + bool Found = true; + ArrayRef M; + if (match(V, (m_Shuffle(m_Value(), m_Value(), m_Mask(M)))) && + M.equals(ShuffMask)) { + CurShuffUses[I] = ImmUses; + return true; + } + if ((match(V, m_Shuffle(m_InsertElt(m_Poison(), m_Value(), m_Zero()), + m_Poison(), m_ZeroMask())))) + return true; // scalar converted to a vector + + auto *II = dyn_cast(I); + if (!I->isBinaryOp() && (!II || !isValidIntrinsic(II))) + return false; + + for (Value *OpV : getArgOperands(I)) { + std::pair P = stripCasts(OpV); + OpV = P.first; + + SmallVector ImmUseList; + if (P.second) + ImmUseList.push_back(dyn_cast(P.second)); + else + ImmUseList.push_back(dyn_cast(I)); + + if (isa(OpV)) + continue; + if (isConstantVectorSplat(OpV)) + continue; + if (!dyn_cast(OpV)) + return false; + if ((match(OpV, m_Shuffle(m_InsertElt(m_Poison(), m_Value(), m_Zero()), + m_Poison(), m_ZeroMask())))) + continue; + Found &= hasDefWithSameShuffMask(OpV, ImmUseList, ShuffMask, CurShuffUses); + } + return Found; +} + +void HexagonOptShuffleVector::FindHiLoUse(ValueVector &UseList, + Instruction *&HI, Instruction *&LO) { + + for (unsigned i = 0; i < UseList.size(); i++) { + auto *J = dyn_cast(UseList[i]); + auto *CI = dyn_cast(J); + if (CI) { + auto *II = dyn_cast(CI); + if (II) { + Intrinsic::ID IntID = II->getIntrinsicID(); + if (IntID == Intrinsic::hexagon_V6_hi_128B) + HI = J; + if (IntID == Intrinsic::hexagon_V6_lo_128B) + LO = J; + } + } + } +} + +bool HexagonOptShuffleVector::isValidUseInstr(ValueVector &UseList, + Instruction *&UI) { + // Don't allow multiple uses. Only done in case of a Hi/Lo pair. + if (UseList.size() != 1) + return false; + UI = dyn_cast(UseList[0]); + if (!UI) + return false; + // Should be either a binary op or one of the supported instrinsics. 
+ if (auto *CI = dyn_cast(UI)) { + auto *II = dyn_cast(CI); + if (!II || !isValidIntrinsic(II)) + return false; + UI = II; + } else if (!UI->isBinaryOp()) + return false; + return true; +} + +// Check all the operands of 'Use' to make sure that they are either: +// 1) a constant +// 2) a scalar +// 3) a constant vector +// 4) a vector using the same mask as I +bool HexagonOptShuffleVector::areAllOperandsValid(Instruction *I, + Instruction *Use, + ArrayRef &ShuffMask, + ShuffUseList &CurShuffUses) { + bool AllOperandsOK = true; + for (Value *OpV : getArgOperands(Use)) { + bool HasOneUse = OpV->hasOneUse(); + std::pair P = stripCasts(OpV); + OpV = P.first; + + SmallVector ImmUseList; + if (P.second) + ImmUseList.push_back(dyn_cast(P.second)); + else + ImmUseList.push_back(dyn_cast(Use)); + + if (OpV == I || isa(OpV)) + continue; + if (isConstantVectorSplat(OpV)) + continue; + if (!dyn_cast(OpV) || !HasOneUse) + return false; + + if ((match(OpV, m_Shuffle(m_InsertElt(m_Poison(), m_Value(), m_Zero()), + m_Poison(), m_ZeroMask())))) + continue; + AllOperandsOK &= + hasDefWithSameShuffMask(OpV, ImmUseList, ShuffMask, CurShuffUses); + } + return AllOperandsOK; +} + +// Find the new location where it's safe to relocate shuffle instruction 'I'. +bool HexagonOptShuffleVector::findNewShuffLoc(Instruction *I, + ArrayRef &ShuffMask, + Value *&NewLoc) { + DefSearchCount = 0; + ValueVector UseList; + if (!getUseList(I, UseList)) + return false; + + using ShuffUseList = + SmallDenseMap>; + ShuffUseList CurShuffUses; + // Check for Hi and Lo pair. + Instruction *HI = nullptr, *LO = nullptr; + FindHiLoUse(UseList, HI, LO); + if (UseList.size() == 2 && HI && LO) { + // If 'I' has Hi and Lo use-pair, then it can be relocated only after Hi/Lo + // use-chain's merge point, i.e., after a concat vector provided it's safe + // to do so. + LLVM_DEBUG({ + dbgs() << "\tFollowing the Hi/LO pair :\n"; + dbgs() << "\t\tHI - "; + HI->dump(); + dbgs() << "\t\tLO - "; + LO->dump(); + }); + if (!analyzeHiLoUse(HI, LO, ShuffMask, NewLoc, CurShuffUses)) + return false; + for (auto &it : CurShuffUses) + ShuffUses[it.first] = it.second; + return true; + } else { // Single use case + Instruction *UI = nullptr; + if (!isValidUseInstr(UseList, UI)) + return false; + assert(UI && "Expected a valid use, but found none!!"); + + if (HI || LO) { + // If the single use case is either Hi or Lo, it is not safe to relocate + return false; + } + + LLVM_DEBUG(dbgs() << "\tChecking operands in 'use' : \n\t\t"; UI->dump()); + if (!areAllOperandsValid(I, UI, ShuffMask, CurShuffUses)) { + LLVM_DEBUG(dbgs() << "\t\tNOT SAFE -- Exiting!!\n"); + return false; + } + for (auto &it : CurShuffUses) + ShuffUses[it.first] = it.second; + NewLoc = UI; + // Keep looking for the new location until can't proceed any longer. + findNewShuffLoc(UI, ShuffMask, NewLoc); + } + return true; +} + +// Move shuffle instruction 'I' after 'NewLoc'. +bool HexagonOptShuffleVector::relocateShuffVec( + Instruction *I, ArrayRef &M, Value *NewLoc, + std::list &WorkList) { + // Remove original vector shuffles at the input operands. + // However, it can be done only if the replacements have the + // same number of vector elements as the original operands. 
+ std::map InstrMap; + bool CanReplace = true; + unsigned ShuffInstCount = ShuffUses.size(); + for (auto &it : ShuffUses) { + Instruction *J = it.first; + Visited.insert(J); + Value *ShuffleOP = nullptr; + match(J, (m_Shuffle(m_Value(ShuffleOP), m_Poison(), m_Mask(M)))); + VectorType *JTy = cast(J->getType()); + VectorType *ShuffTy = cast(ShuffleOP->getType()); + if (JTy->getElementCount() != ShuffTy->getElementCount()) + CanReplace = false; + + // Relocate shufflevector after a wider instruction only if there are + // at least two or more shufflevectors being relocated in order for the + // relocation to be profitable as otherwise it will require more shuffles. + VectorType *NewShuffTy = cast(NewLoc->getType()); + if (ShuffInstCount == 1 && + NewShuffTy->getElementType() > ShuffTy->getElementType()) + CanReplace = false; + InstrMap[J] = ShuffleOP; + } + if (!CanReplace) { + LLVM_DEBUG(dbgs() << "\tRelocation FAILED!! \n"); + return false; + } + for (auto IM : InstrMap) { + Instruction *J = IM.first; + assert(ShuffUses.count(J)); + SmallVector Uses = ShuffUses[J]; + if (Uses.size() > 0) { + for (auto *U : Uses) + U->replaceUsesOfWith(IM.first, IM.second); + } else + // This is the shuffle we started with, and we have already made sure + // that it has either single use or a HI/LO use pair. So, it's okay + // to replace all its uses with the input to the shuffle instruction. + IM.first->replaceAllUsesWith(IM.second); + } + // Shuffle the output of NewLoc based on the original mask. + Instruction *Pos = dyn_cast(NewLoc); + assert(Pos); + Pos = Pos->getNextNode(); + IRBuilder<> IRB(Pos); + Value *NewShuffV = + IRB.CreateShuffleVector(NewLoc, PoisonValue::get(NewLoc->getType()), M); + Instruction *NewInst = dyn_cast(NewShuffV); + if (!NewInst) { + LLVM_DEBUG(dbgs() << "\tRelocation FAILED!! \n"); + return false; + } + for (auto UI = NewLoc->user_begin(), UE = NewLoc->user_end(); UI != UE;) { + Use &TheUse = UI.getUse(); + ++UI; + Instruction *J = dyn_cast(TheUse.getUser()); + if (J && TheUse.getUser() != NewShuffV) + J->replaceUsesOfWith(NewLoc, NewShuffV); + } + WorkList.push_back(NewInst); + LLVM_DEBUG(dbgs() << "\tRelocation Successfull!! \n"); + LLVM_DEBUG(dbgs() << "\tAdded to Worklist :\n"; NewInst->dump()); + return true; +} + +bool HexagonOptShuffleVector::visitBlock(BasicBlock *B) { + bool Changed = false; + ArrayRef M; + std::list WorkList; + LLVM_DEBUG(dbgs() << "Preparing worklist for BB:\n"); + LLVM_DEBUG(B->dump()); + for (auto &I : *B) { + if (match(&I, (m_Shuffle(m_Value(), m_Value(), m_ZeroMask())))) + continue; // Skip - building vector from a scalar + if (match(&I, (m_Shuffle(m_Value(), m_Poison(), m_Mask(M))))) { + WorkList.push_back(&I); + LLVM_DEBUG(dbgs() << "\tAdded instr - "; I.dump()); + } + } + + LLVM_DEBUG(dbgs() << "Processing worklist:\n"); + while (!WorkList.empty()) { +#ifndef NDEBUG + int Limit = ShuffVecLimit; + if (Limit >= 0) { + if (NumRelocated >= ShuffVecLimit) { + LLVM_DEBUG({ + dbgs() << "Reached maximum limit!! \n"; + dbgs() << "Can't process any more shuffles.... \n"; + }); + return Changed; + } + } +#endif + Instruction *I = WorkList.front(); + WorkList.pop_front(); + LLVM_DEBUG(dbgs() << "\tProcessing instr - "; I->dump()); + Value *NewLoc = nullptr; + + // 'ShuffUses' is used to keep track of the vector shuffles that need to + // be relocated along with their immediate uses that are known to satisfy + // all the safety requirements of the relocation. 
+ // NOTE: The shuffle instr 'I', where the analysis starts, doesn't have + // its immediate uses set in 'ShuffUses'. This can be done but isn't + // necessary. At this point, only shuffles with single use or a HI/LO pair + // are allowed. This is done mostly because those with the multiple uses + // aren't expected to be much profitable and can be extended in the future + // if necessary. For now, all the uses in such cases can be safely updated + // when the corresponding vector shuffle is relocated. + + ShuffUses.clear(); + ShuffUses[I] = SmallVector(); + // Skip if node already visited. + if (!Visited.insert(I).second) { + LLVM_DEBUG(dbgs() << "\t\tSKIPPING - Already visited ...\n"); + continue; + } + if (!match(I, (m_Shuffle(m_Value(), m_Poison(), m_Mask(M))))) { + LLVM_DEBUG(dbgs() << "\t\tSKIPPING - Not a vector shuffle ...\n"); + continue; + } + if (!findNewShuffLoc(I, M, NewLoc) || !NewLoc) { + LLVM_DEBUG(dbgs() << "\t\tSKIPPING - NewLoc not found ...\n"); + continue; + } + LLVM_DEBUG(dbgs() << "\t\tRelocating after -- "; NewLoc->dump()); + Changed |= relocateShuffVec(I, M, NewLoc, WorkList); +#ifndef NDEBUG + NumRelocated++; +#endif + } + return Changed; +} + +bool HexagonOptShuffleVector::runOnFunction(Function &F) { + HST = TM->getSubtargetImpl(F); + // Works only for 128B mode but can be extended for 64B if needed. + if (skipFunction(F) || !HST->useHVX128BOps()) + return false; + + bool Changed = false; + for (auto &B : F) + Changed |= visitBlock(&B); + + return Changed; +} + +FunctionPass * +llvm::createHexagonOptShuffleVector(const HexagonTargetMachine &TM) { + return new HexagonOptShuffleVector(&TM); +} diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 4cb29e7f00317..674d19176a88b 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -261,6 +261,16 @@ let Predicates = [UseHVX] in { defm: NopCast_pat; } +let Predicates = [UseHVXV68] in { + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; + defm: NopCast_pat; +} + let Predicates = [UseHVX, UseHVXFloatingPoint] in { defm: NopCast_pat; defm: NopCast_pat; @@ -307,6 +317,8 @@ let Predicates = [UseHVX] in { (Combinev HvxVR:$Vt, HvxVR:$Vs)>; def: Pat<(VecPI32 (concat_vectors HVI32:$Vs, HVI32:$Vt)), (Combinev HvxVR:$Vt, HvxVR:$Vs)>; + def: Pat<(VecPF32 (concat_vectors HVF32:$Vs, HVF32:$Vt)), + (Combinev HvxVR:$Vt, HvxVR:$Vs)>; def: Pat<(VecQ8 (qcat HQ16:$Qs, HQ16:$Qt)), (Combineq $Qt, $Qs)>; def: Pat<(VecQ16 (qcat HQ32:$Qs, HQ32:$Qt)), (Combineq $Qt, $Qs)>; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index d9824a3154093..d98fe80f453ab 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -47,6 +47,14 @@ static cl::opt DisableHardwareLoops("disable-hexagon-hwloops", cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target")); +static cl::opt + EnableGenWideningVec("hexagon-widening-vectors", cl::init(true), cl::Hidden, + cl::desc("Generate widening vector instructions")); + +static cl::opt + EnableOptShuffleVec("hexagon-opt-shuffvec", cl::init(true), cl::Hidden, + cl::desc("Enable optimization of shuffle vectors")); + static cl::opt DisableAModeOpt("disable-hexagon-amodeopt", cl::Hidden, cl::desc("Disable Hexagon Addressing Mode Optimization")); @@ -321,6 +329,8 @@ TargetPassConfig 
*HexagonTargetMachine::createPassConfig(PassManagerBase &PM) { } void HexagonPassConfig::addIRPasses() { + HexagonTargetMachine &HTM = getHexagonTargetMachine(); + TargetPassConfig::addIRPasses(); bool NoOpt = (getOptLevel() == CodeGenOptLevel::None); @@ -350,6 +360,13 @@ void HexagonPassConfig::addIRPasses() { // Replace certain combinations of shifts and ands with extracts. if (EnableGenExtract) addPass(createHexagonGenExtract()); + if (EnableGenWideningVec) { + addPass(createHexagonGenWideningVecInstr(HTM)); + addPass(createHexagonGenWideningVecFloatInstr(HTM)); + addPass(createDeadCodeEliminationPass()); + } + if (EnableOptShuffleVec) + addPass(createHexagonOptShuffleVector(HTM)); } } diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 5c50ec2425b7c..ce5431758b1c7 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -368,8 +368,8 @@ class AlignVectors { const HexagonVectorCombine &HVC; }; -[[maybe_unused]] -raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) { +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, + const AlignVectors::AddrInfo &AI) { OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n'; OS << "Addr: " << *AI.Addr << '\n'; OS << "Type: " << *AI.ValTy << '\n'; @@ -379,8 +379,8 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) { return OS; } -[[maybe_unused]] -raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) { +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, + const AlignVectors::MoveGroup &MG) { OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no"); OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n'; OS << "Main\n"; @@ -398,9 +398,8 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) { return OS; } -[[maybe_unused]] -raw_ostream &operator<<(raw_ostream &OS, - const AlignVectors::ByteSpan::Block &B) { +[[maybe_unused]] raw_ostream & +operator<<(raw_ostream &OS, const AlignVectors::ByteSpan::Block &B) { OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] "; if (B.Seg.Val == reinterpret_cast(&B)) { OS << "(self:" << B.Seg.Val << ')'; @@ -412,8 +411,8 @@ raw_ostream &operator<<(raw_ostream &OS, return OS; } -[[maybe_unused]] -raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) { +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, + const AlignVectors::ByteSpan &BS) { OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n'; for (const AlignVectors::ByteSpan::Block &B : BS) OS << B << '\n'; @@ -2475,19 +2474,19 @@ Value *HvxIdioms::processVGather(Instruction &In) const { Dst->eraseFromParent(); } else if (Qual == HvxIdioms::LLVM_Scatter) { // Gather feeds directly into scatter. 
- LLVM_DEBUG({ - auto *DstInpTy = cast(Dst->getOperand(1)->getType()); - assert(DstInpTy && "Cannot handle no vector type for llvm.scatter"); - unsigned DstInpSize = HVC.getSizeOf(DstInpTy); - unsigned DstElements = HVC.length(DstInpTy); - auto *DstElemTy = cast(DstInpTy->getElementType()); - assert(DstElemTy && "llvm.scatter needs vector of ptr argument"); - dbgs() << " Gather feeds into scatter\n Values to scatter : " - << *Dst->getOperand(0) << "\n"; - dbgs() << " Dst type(" << *DstInpTy << ") elements(" << DstElements - << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy - << ") Access alignment(" << *Dst->getOperand(2) << ")\n"; - }); + auto *DstInpTy = cast(Dst->getOperand(1)->getType()); + assert(DstInpTy && "Cannot handle no vector type for llvm.scatter"); + [[maybe_unused]] unsigned DstInpSize = HVC.getSizeOf(DstInpTy); + [[maybe_unused]] unsigned DstElements = HVC.length(DstInpTy); + [[maybe_unused]] auto *DstElemTy = + cast(DstInpTy->getElementType()); + assert(DstElemTy && "llvm.scatter needs vector of ptr argument"); + LLVM_DEBUG(dbgs() << " Gather feeds into scatter\n Values to scatter : " + << *Dst->getOperand(0) << "\n"); + LLVM_DEBUG(dbgs() << " Dst type(" << *DstInpTy << ") elements(" + << DstElements << ") VecLen(" << DstInpSize << ") type(" + << *DstElemTy << ") Access alignment(" + << *Dst->getOperand(2) << ")\n"); // Address of source auto *Src = getPointer(IndexLoad); if (!Src) diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 2f59b7c0fdb15..10c350e0e2bae 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -67,6 +67,11 @@ void HexagonMCELFStreamer::emitInstruction(const MCInst &MCB, assert(MCB.getOpcode() == Hexagon::BUNDLE); assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE); assert(HexagonMCInstrInfo::bundleSize(MCB) > 0); + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + HexagonMCChecker Check(getContext(), *MCII, STI, const_cast(MCB), + *RI); + [[maybe_unused]] bool CheckOk = Check.check(false); + assert(CheckOk); // At this point, MCB is a bundle // Iterate through the bundle and assign addends for the instructions diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-vpackew.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-vpackew.ll index 67d9e19b8975e..c7c5d5f693c55 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/isel-vpackew.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-vpackew.ll @@ -1,29 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=hexagon < %s | FileCheck %s +; RUN: llc -mtriple=hexagon -mattr=+v75,+hvxv75,+hvx-length128b < %s | FileCheck %s -define void @f0(ptr %a0, ptr %a1, ptr %a2) #0 { +define void @f0(ptr %a0, ptr %a1, ptr %a2) { ; CHECK-LABEL: f0: ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { -; CHECK-NEXT: r7 = #-4 -; CHECK-NEXT: v0 = vmem(r0+#0) +; CHECK-NEXT: [[RS:r[0-9]+]] = #-4 +; CHECK-NEXT: [[V0:v[0-9]+]] = vmem([[A0:r[0-9]+]]+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmem(r1+#0) +; CHECK-NEXT: [[ACC:v[0-9]+]]:2.w = vmpy([[V0]].h,[[V1:v[0-9]+]].h) +; CHECK-NEXT: [[V1]].cur = vmem([[A1:r[0-9]+]]+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1:0.w = vmpy(v0.h,v1.h) +; CHECK-NEXT: [[ACC]]:2.w += vmpy([[V0]].h,[[V1]].h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1:0.w = vadd(v1:0.w,v1:0.w) +; CHECK-NEXT: [[VDEAL:v[0-9]+]]:0 = 
vdeal([[ACC]],[[V2:v[0-9]+]],[[RS]]) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1:0 = vdeal(v1,v0,r7) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v0.h = vpacko(v1.w,v0.w) +; CHECK-NEXT: [[VOUT:v[0-9]+]].h = vpacko([[VDEAL]].w,[[V0]].w) ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: vmem(r2+#0) = v0.new +; CHECK-NEXT: vmem([[A2:r[0-9]+]]+#0) = [[VOUT]].new ; CHECK-NEXT: } b0: %v0 = load <64 x i16>, ptr %a0, align 128 @@ -47,6 +45,4 @@ b0: ret void } -declare <64 x i32> @llvm.hexagon.V6.vmpyhv.128B(<32 x i32>, <32 x i32>) #0 - -attributes #0 = { nounwind "target-features"="+v66,+hvxv66,+hvx-length128b" } +declare <64 x i32> @llvm.hexagon.V6.vmpyhv.128B(<32 x i32>, <32 x i32>) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/widen-setcc.ll b/llvm/test/CodeGen/Hexagon/autohvx/widen-setcc.ll index e4765bbcb4ef9..da962143da8f7 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/widen-setcc.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/widen-setcc.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=hexagon -hexagon-hvx-widen=32 < %s | FileCheck %s +; RUN: llc -mtriple=hexagon -mv75 -mhvx -mattr=+hvx-length128b -hexagon-hvx-widen=32 < %s | FileCheck %s ; Make sure that this doesn't crash. ; CHECK-LABEL: f0: @@ -16,5 +16,3 @@ b0: store <16 x i16> %v4, ptr %v0, align 2 ret void } - -attributes #0 = { "target-features"="+hvxv66,+hvx-length128b" } diff --git a/llvm/test/CodeGen/Hexagon/bug54537-vavg.ll b/llvm/test/CodeGen/Hexagon/bug54537-vavg.ll new file mode 100644 index 0000000000000..5ed41e3dbbcac --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/bug54537-vavg.ll @@ -0,0 +1,20 @@ +; RUN: llc -march=hexagon -mv75 -mhvx -mattr=+hvx-length128b < %s +; REQUIRES: asserts + +define dso_local void @vc1INTERP_PredictMB([64 x i8]* %pPredBlk) local_unnamed_addr { +entry: + %next.gep111 = getelementptr [64 x i8], [64 x i8]* %pPredBlk, i32 0, i32 0 + %wide.load112 = load <32 x i8>, <32 x i8>* poison, align 32 + %0 = zext <32 x i8> %wide.load112 to <32 x i16> + %1 = add nuw nsw <32 x i16> zeroinitializer, + %2 = add nuw nsw <32 x i16> %1, %0 + %3 = lshr <32 x i16> %2, + %4 = trunc <32 x i16> %3 to <32 x i8> + %5 = bitcast i8* %next.gep111 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + call void @llvm.lifetime.end.p0i8(i64 64, i8* nonnull null) + unreachable +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) diff --git a/llvm/test/CodeGen/Hexagon/extend-multiply-for-output-fpext.ll b/llvm/test/CodeGen/Hexagon/extend-multiply-for-output-fpext.ll new file mode 100644 index 0000000000000..c4a221af9cdcd --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/extend-multiply-for-output-fpext.ll @@ -0,0 +1,16 @@ +; RUN: llc -mtriple=hexagon -mattr=+hvx,+hvx-length128b,+hvxv81 %s -o - | FileCheck %s + +; Test that on hexagon computation of a_sq_f32 is done using a widening multiply +; instruction. 
+define dso_local <64 x float> @a_sq_times_b_sq(<64 x half> %a, <64 x float> %b) { +entry: + %a_sq_f16 = fmul <64 x half> %a, %a + %a_sq_f32 = fpext <64 x half> %a_sq_f16 to <64 x float> + %b_sq = fmul <64 x float> %b, %b + %result = fmul <64 x float> %a_sq_f32, %b_sq + ret <64 x float> %result +} +; CHECK: a_sq_times_b_sq +; CHECK: .qf32 = vmpy({{v[0-9]+}}.hf,{{v[0-9]+}}.hf) +; CHECK: .qf32 = vmpy({{v[0-9]+}}.sf,{{v[0-9]+}}.sf) +; CHECK: .qf32 = vmpy({{v[0-9]+}}.sf,{{v[0-9]+}}.sf) diff --git a/llvm/test/CodeGen/Hexagon/no_widening_of_bf16_vecmul.ll b/llvm/test/CodeGen/Hexagon/no_widening_of_bf16_vecmul.ll new file mode 100644 index 0000000000000..8fa293fc23f5d --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/no_widening_of_bf16_vecmul.ll @@ -0,0 +1,60 @@ +;; RUN: llc --mtriple=hexagon --mcpu=hexagonv81 --mattr=+hvxv81,+hvx-length128b %s -o - | FileCheck %s + +; In this file, we check that fmul(exttof32(v1.bf16), exttof32(v2.bf16)) is not +; transformed to exttof32(fmul(v1.hf, v2.hf)). This was a bug in +; hexagon-widening-vector pass. + +define void @halfbf16(ptr readonly %x, ptr %y) { +entry: + %xvec.bf16 = load <64 x bfloat>, ptr %x, align 2 + %xvec.f32 = fpext <64 x bfloat> %xvec.bf16 to <64 x float> + %yvec.f32 = fmul <64 x float> %xvec.f32, splat (float 5.000000e-01) + %yvec.bf16 = fptrunc <64 x float> %yvec.f32 to <64 x bfloat> + store <64 x bfloat> %yvec.bf16, ptr %y, align 2 + ret void +} +;; CHECK: vmpy(v{{[0-9]+}}.sf,v{{[0-9]+}}.sf) +;; CHECK: vmpy(v{{[0-9]+}}.sf,v{{[0-9]+}}.sf) + + +define void @vecmulbf16(ptr readonly %x, ptr readonly %y, ptr %z) { +entry: + %xvec.bf16 = load <64 x bfloat>, ptr %x, align 2 + %yvec.bf16 = load <64 x bfloat>, ptr %y, align 2 + %xvec.f32 = fpext <64 x bfloat> %xvec.bf16 to <64 x float> + %yvec.f32 = fpext <64 x bfloat> %yvec.bf16 to <64 x float> + %zvec.f32 = fmul <64 x float> %xvec.f32, %yvec.f32 + %zvec.bf16 = fptrunc <64 x float> %zvec.f32 to <64 x bfloat> + store <64 x bfloat> %zvec.bf16, ptr %z, align 2 + ret void +} + +;; CHECK: vmpy(v{{[0-9]+}}.sf,v{{[0-9]+}}.sf) +;; CHECK: vmpy(v{{[0-9]+}}.sf,v{{[0-9]+}}.sf) + + +define void @halff16(ptr readonly %x, ptr %y) { +entry: + %xvec.f16 = load <64 x half>, ptr %x, align 2 + %xvec.f32 = fpext <64 x half> %xvec.f16 to <64 x float> + %yvec.f32 = fmul <64 x float> %xvec.f32, splat (float 5.000000e-01) + %yvec.f16 = fptrunc <64 x float> %yvec.f32 to <64 x half> + store <64 x half> %yvec.f16, ptr %y, align 2 + ret void +} +;; CHECK: vmpy(v{{[0-9]+}}.hf,v{{[0-9]+}}.hf) + + +define void @vecmulf16(ptr readonly %x, ptr readonly %y, ptr %z) { +entry: + %xvec.f16 = load <64 x half>, ptr %x, align 2 + %yvec.f16 = load <64 x half>, ptr %y, align 2 + %xvec.f32 = fpext <64 x half> %xvec.f16 to <64 x float> + %yvec.f32 = fpext <64 x half> %yvec.f16 to <64 x float> + %zvec.f32 = fmul <64 x float> %xvec.f32, %yvec.f32 + %zvec.f16 = fptrunc <64 x float> %zvec.f32 to <64 x half> + store <64 x half> %zvec.f16, ptr %z, align 2 + ret void +} + +;; CHECK: vmpy(v{{[0-9]+}}.hf,v{{[0-9]+}}.hf) diff --git a/llvm/test/CodeGen/Hexagon/shortvec-vasrsat.ll b/llvm/test/CodeGen/Hexagon/shortvec-vasrsat.ll new file mode 100644 index 0000000000000..99db9f9c9354a --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/shortvec-vasrsat.ll @@ -0,0 +1,68 @@ + +; RUN: llc -march=hexagon -hexagon-hvx-widen=32 -hexagon-widen-short-vector -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s + +; CHECK-LABEL: test_vasr +; CHECK: = vasr{{.*}}:sat + +define dllexport void @test_vasr(i64 %seed0, i64 %seed1, i8* %dst) local_unnamed_addr { +entry: + %1 = 
trunc i64 %seed0 to i32 + %t.1 = trunc i64 %seed1 to i32 + %2 = lshr i32 %t.1, 23 + %3 = and i32 %2, 255 + %4 = icmp ugt i32 %3, 125 + %5 = select i1 %4, i32 %3, i32 125 + %6 = sub nsw i32 132, %5 + %7 = shl i32 %1, %6 + %8 = trunc i32 %7 to i16 + %9 = trunc i32 %6 to i16 + + %broadcast.splatinsert50 = insertelement <64 x i16> poison, i16 %8, i32 0 + %broadcast.splat51 = shufflevector <64 x i16> %broadcast.splatinsert50, <64 x i16> poison, <64 x i32> zeroinitializer + %broadcast.splatinsert52 = insertelement <64 x i16> poison, i16 %9, i32 0 + %broadcast.splat53 = shufflevector <64 x i16> %broadcast.splatinsert52, <64 x i16> poison, <64 x i32> zeroinitializer + + %11 = call <64 x i16> @llvm.sadd.sat.v64i16(<64 x i16> zeroinitializer, <64 x i16> %broadcast.splat51) + %12 = ashr <64 x i16> %11, %broadcast.splat53 + %13 = icmp slt <64 x i16> %12, + %14 = select <64 x i1> %13, <64 x i16> %12, <64 x i16> + %15 = icmp sgt <64 x i16> %14, zeroinitializer + %16 = select <64 x i1> %15, <64 x i16> %14, <64 x i16> zeroinitializer + %17 = trunc <64 x i16> %16 to <64 x i8> + %elem = extractelement <64 x i8> %17, i32 0 + store i8 %elem, i8* %dst, align 1 + ret void +} + +; CHECK-LABEL: test_vasr_with_intrinsic +; CHECK: v{{[0-9:]+}}.ub = vasr(v{{[0-9]+}}.h,v{{[0-9]+}}.h,r{{[0-9]+}}):sat + +define dllexport void @test_vasr_with_intrinsic(i64 %seed0, i64 %seed1, i8* %dst) local_unnamed_addr { +entry: + %1 = trunc i64 %seed0 to i32 + %t.1 = trunc i64 %seed1 to i32 + %2 = lshr i32 %t.1, 23 + %3 = and i32 %2, 255 + %4 = icmp ugt i32 %3, 125 + %5 = select i1 %4, i32 %3, i32 125 + %6 = sub nsw i32 132, %5 + %7 = shl i32 %1, %6 + %8 = trunc i32 %7 to i16 + %9 = trunc i32 %6 to i16 + + %broadcast.splatinsert50 = insertelement <64 x i16> poison, i16 %8, i32 0 + %broadcast.splat51 = shufflevector <64 x i16> %broadcast.splatinsert50, <64 x i16> poison, <64 x i32> zeroinitializer + + %11 = call <64 x i16> @llvm.sadd.sat.v64i16(<64 x i16> zeroinitializer, <64 x i16> %broadcast.splat51) + %12 = ashr <64 x i16> %11, + %13 = call <64 x i16> @llvm.smin.v64i16(<64 x i16> %12, <64 x i16> ) + %14 = call <64 x i16> @llvm.smax.v64i16(<64 x i16> %13, <64 x i16> zeroinitializer) + %15 = trunc <64 x i16> %14 to <64 x i8> + %elem = extractelement <64 x i8> %15, i32 0 + store i8 %elem, i8* %dst, align 1 + ret void +} + +declare <64 x i16> @llvm.sadd.sat.v64i16(<64 x i16>, <64 x i16>) +declare <64 x i16> @llvm.smin.v64i16(<64 x i16>, <64 x i16>) +declare <64 x i16> @llvm.smax.v64i16(<64 x i16>, <64 x i16>) diff --git a/llvm/test/CodeGen/Hexagon/shortvec-vavg.ll b/llvm/test/CodeGen/Hexagon/shortvec-vavg.ll new file mode 100644 index 0000000000000..38030acf13329 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/shortvec-vavg.ll @@ -0,0 +1,20 @@ + +; RUN: llc -march=hexagon -hexagon-hvx-widen=32 -hexagon-widen-short-vector -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s + +; CHECK: = vavg(v{{[0-9:]+}}.h,v{{[0-9]+}}.h) + +define dllexport void @tvm_vavg(i8 %val0, i8 %val1, i8* %dst) local_unnamed_addr { +entry: + %1 = insertelement <64 x i8> poison, i8 %val0, i32 62 + %2 = insertelement <64 x i8> %1, i8 %val1, i32 63 + %3 = zext <64 x i8> %2 to <64 x i16> + %t.7 = insertelement <64 x i8> poison, i8 %val1, i32 62 + %t.8 = insertelement <64 x i8> %t.7, i8 %val0, i32 63 + %t.9 = zext <64 x i8> %t.8 to <64 x i16> + %t.17 = add nuw nsw <64 x i16> %t.9, %3 + %t.18 = lshr <64 x i16> %t.17, + %t.19 = trunc <64 x i16> %t.18 to <64 x i8> + %t.29 = extractelement <64 x i8> %t.19, i32 6 + store i8 %t.29, i8* %dst, align 2 + ret void +} diff --git 
a/llvm/test/CodeGen/Hexagon/shortvec-vmpy.ll b/llvm/test/CodeGen/Hexagon/shortvec-vmpy.ll
new file mode 100644
index 0000000000000..994a847b31aac
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/shortvec-vmpy.ll
@@ -0,0 +1,27 @@
+
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b -hexagon-hvx-widen=32 -hexagon-widen-short-vector < %s | FileCheck %s
+
+; CHECK: {{[0-9]+:[0-9]+}}.uh = vmpy(v{{[0-9]+}}.ub,v{{[0-9]+}}.ub)
+
+define dllexport void @test_vmpy(i64 %seed, i8 %val, i8* %dst) local_unnamed_addr {
+entry:
+  ; Replace poison loads with args
+  %t.1 = trunc i64 %seed to i16
+  %0 = lshr i16 %t.1, 7
+  %1 = and i16 %0, 255
+  %broadcast.splatinsert44 = insertelement <64 x i16> poison, i16 %1, i32 0
+  %broadcast.splat45 = shufflevector <64 x i16> %broadcast.splatinsert44, <64 x i16> poison, <64 x i32> zeroinitializer
+  %3 = insertelement <64 x i8> poison, i8 %val, i32 57
+  %4 = insertelement <64 x i8> %3, i8 %val, i32 58
+  %5 = insertelement <64 x i8> %4, i8 %val, i32 59
+  %6 = insertelement <64 x i8> %5, i8 %val, i32 60
+  %7 = insertelement <64 x i8> %6, i8 %val, i32 61
+  %8 = insertelement <64 x i8> %7, i8 %val, i32 62
+  %9 = insertelement <64 x i8> %8, i8 %val, i32 63
+  %10 = zext <64 x i8> %9 to <64 x i16>
+  %11 = mul nuw <64 x i16> %broadcast.splat45, %10
+  %12 = trunc <64 x i16> %11 to <64 x i8>
+  %elem = extractelement <64 x i8> %12, i32 0
+  store i8 %elem, i8* %dst, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/Hexagon/vadd-const.ll b/llvm/test/CodeGen/Hexagon/vadd-const.ll
new file mode 100644
index 0000000000000..da6ccffc0093d
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/vadd-const.ll
@@ -0,0 +1,114 @@
+; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s
+
+; Make sure that the appropriate vadd instructions are generated when
+; adding a vector of constant values.
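For illustration only (not part of the patch): a minimal sketch of the pattern the tests below exercise, with a hypothetical splat constant standing in for the constant vectors, which are not reproduced in this patch text. Per the CHECK lines that follow, the constant is expected to be materialized with vsplat and the add emitted as a widening vadd on the narrow elements.

define void @vadd_const_sketch(ptr %a, ptr %r) {
entry:
  ; 8-bit data widened to 32 bits, then added to a constant splat
  %v = load <128 x i8>, ptr %a, align 1
  %w = zext <128 x i8> %v to <128 x i32>
  %s = add nuw nsw <128 x i32> %w, splat (i32 7)   ; constant value is hypothetical
  store <128 x i32> %s, ptr %r, align 4
  ret void
}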
+ +; CHECK-LABEL: test_vadd_const1 +; CHECK: [[REG0:(r[0-9]+)]] = # +; CHECK: [[VREG0:(v[0-9]+)]].b = vsplat([[REG0]]) +; CHECK: v{{[0-9:]+}}.h = vadd(v{{[0-9]+}}.ub,[[VREG0]].ub) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vadd_const1(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %1 = add nuw nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vadd_const2 +; CHECK: [[REG0:(r[0-9]+)]] = # +; CHECK: [[VREG0:([0-9]+)]].h = vsplat([[REG0]]) +; CHECK: [[VREG1:([0-9]+)]] = v[[VREG0]] +; CHECK: v{{[0-9:]+}}.h = vadd({{.*}}.h,{{v[VREG0]|v[VREG1]}} + +; Function Attrs: norecurse nounwind +define dso_local void @test_vadd_const2(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %1 = add nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vadd_const2_1 +; CHECK: [[REG0:(r[0-9]+)]] = #-270 +; CHECK: [[VREG0:([0-9]+)]] = vsplat([[REG0]]) +; CHECK: [[VREG1:([0-9:]+)]] = v[[VREG0]] +; CHECK: v{{[0-9:]+}}.w = vadd({{.*}}.w,{{v[VREG1]|v[VREG0]}} + +; Function Attrs: norecurse nounwind +define dso_local void @test_vadd_const2_1(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %1 = add nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + + + +; CHECK-LABEL: test_vadd_const3 +; CHECK: [[REG0:(r[0-9]+)]] = # +; CHECK: [[VREG0:(v[0-9]+)]].h = vsplat([[REG0]]) +; CHECK: v{{[0-9:]+}}.w = vadd(v{{[0-9]+}}.uh,[[VREG0]].uh) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vadd_const3(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 { +entry: + %wide.load = load <64 x i16>, ptr %a, align 2 + %0 = zext <64 x i16> %wide.load to <64 x i32> + %1 = add nuw nsw <64 x i32> %0, + store <64 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vadd_const4 +; CHECK: [[REG0:(r[0-9]+)]] = #-23 +; CHECK: [[VREG0:([0-9]+)]] = vsplat([[REG0]]) +; CHECK: [[VREG1:([0-9:]+)]] = v[[VREG0]] +; CHECK: v{{[0-9:]+}}.w = vadd({{.*}}.w,{{v[VREG1]|v[VREG0]}} + +; Function Attrs: norecurse nounwind +define dso_local void @test_vadd_const4(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 { +entry: + %wide.load = load <64 x i16>, ptr %a, align 2 + %0 = zext <64 x i16> %wide.load to <64 x i32> + %1 = add nsw <64 x i32> %0, + store <64 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vadd_const5 +; CHECK: [[REG0:(r[0-9]+)]] = #-257 +; CHECK: [[VREG0:([0-9]+)]] = vsplat([[REG0]]) +; CHECK: [[VREG1:([0-9:]+)]] = v[[VREG0]] +; CHECK: v{{[0-9:]+}}.w = vadd({{.*}}.w,{{v[VREG1]|v[VREG0]}} + +; Function Attrs: norecurse nounwind +define dso_local void @test_vadd_const5(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 { +entry: + %wide.load = load <64 x i16>, ptr %a, align 2 + %0 = zext <64 x i16> %wide.load to <64 x i32> + %1 = add nsw <64 x i32> %0, + store <64 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vadd_const6 +; CHECK: [[REG0:(r[0-9]+)]] = #-23 +; CHECK: [[VREG0:(v[0-9]+)]] = vsplat([[REG0]]) +; CHECK: v{{[0-9:]+}}.w = vadd({{.*}}[[VREG0]].w{{.*}}) + +; Function Attrs: norecurse nounwind +define dso_local void 
@test_vadd_const6(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 { +entry: + %wide.load = load <32 x i32>, ptr %a, align 4 + %0 = add nsw <32 x i32> %wide.load, + store <32 x i32> %0, ptr %r, align 4 + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/vasr-sat.ll b/llvm/test/CodeGen/Hexagon/vasr-sat.ll new file mode 100644 index 0000000000000..9ad9666dd574f --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vasr-sat.ll @@ -0,0 +1,66 @@ + +; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s + +; Test for saturating vasr instruction. + +; CHECK-LABEL: test_vasr +; CHECK: = vasr{{.*}}:sat + +define dllexport void @test_vasr(i64 %seed0, i64 %seed1, + i8* %dst) local_unnamed_addr { +entry: + %1 = trunc i64 %seed0 to i32 + %t.1 = trunc i64 %seed1 to i32 + %2 = lshr i32 %t.1, 23 + %3 = and i32 %2, 255 + %4 = icmp ugt i32 %3, 125 + %5 = select i1 %4, i32 %3, i32 125 + %6 = sub nsw i32 132, %5 + %7 = shl i32 %1, %6 + %8 = trunc i32 %7 to i16 + %9 = trunc i32 %6 to i16 + + ; Broadcast splats + %broadcast.splatinsert216 = insertelement <128 x i16> poison, i16 %9, i32 0 + %broadcast.splat217 = shufflevector <128 x i16> %broadcast.splatinsert216, <128 x i16> poison, <128 x i32> zeroinitializer + %broadcast.splatinsert214 = insertelement <128 x i16> poison, i16 %8, i32 0 + %broadcast.splat215 = shufflevector <128 x i16> %broadcast.splatinsert214, <128 x i16> poison, <128 x i32> zeroinitializer + %11 = ashr <128 x i16> %broadcast.splat215, %broadcast.splat217 + %12 = icmp slt <128 x i16> %11, + %13 = select <128 x i1> %12, <128 x i16> %11, <128 x i16> + %14 = icmp sgt <128 x i16> %13, zeroinitializer + %15 = select <128 x i1> %14, <128 x i16> %13, <128 x i16> zeroinitializer + %16 = trunc <128 x i16> %15 to <128 x i8> + %17 = bitcast i8* %dst to <128 x i8>* + store <128 x i8> %16, <128 x i8>* %17, align 1 + ret void +} + +; CHECK-LABEL: test_vasr_with_intrinsic +; CHECK: = vasr{{.*}}:sat + +define dllexport void @test_vasr_with_intrinsic(i64 %seed0, i64 %seed1, + i8* %dst) local_unnamed_addr { +entry: + %1 = trunc i64 %seed0 to i32 + %t.1 = trunc i64 %seed1 to i32 + %2 = lshr i32 %t.1, 23 + %3 = and i32 %2, 255 + %4 = icmp ugt i32 %3, 125 + %5 = select i1 %4, i32 %3, i32 125 + %6 = sub nsw i32 132, %5 + %7 = shl i32 %1, %6 + %8 = trunc i32 %7 to i16 + %broadcast.splatinsert214 = insertelement <128 x i16> poison, i16 %8, i32 0 + %broadcast.splat215 = shufflevector <128 x i16> %broadcast.splatinsert214, <128 x i16> poison, <128 x i32> zeroinitializer + %11 = ashr <128 x i16> %broadcast.splat215, + %12 = call <128 x i16> @llvm.smin.v128i16(<128 x i16> %11, <128 x i16> ) + %13 = call <128 x i16> @llvm.smax.v128i16(<128 x i16> %12, <128 x i16> zeroinitializer) + %14 = trunc <128 x i16> %13 to <128 x i8> + %15 = bitcast i8* %dst to <128 x i8>* + store <128 x i8> %14, <128 x i8>* %15, align 1 + ret void +} + +declare <128 x i16> @llvm.smin.v128i16(<128 x i16>, <128 x i16>) +declare <128 x i16> @llvm.smax.v128i16(<128 x i16>, <128 x i16>) diff --git a/llvm/test/CodeGen/Hexagon/vavg.ll b/llvm/test/CodeGen/Hexagon/vavg.ll new file mode 100644 index 0000000000000..70c0e482937d7 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vavg.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=hexagon -mv73 -mhvx -mattr=+hvx-length128b < %s | FileCheck %s +; Test for vmpa instruction. 
+ +; CHECK: = vavg(v{{[0-9:]+}}.uh,v{{[0-9]+}}.uh) + +define dllexport void @test_vavg(float %f0, float %f1, + <128 x i8> %src, + i16* %dst) local_unnamed_addr { +entry: + %0 = select i1 false, float %f0, float %f1 + %1 = fptosi float %0 to i16 + %2 = lshr i16 %1, 7 + %3 = and i16 %2, 255 + %4 = and i16 %1, 127 + %broadcast.splatinsert212.1 = insertelement <128 x i16> poison, i16 %4, i32 0 + %broadcast.splat213.1 = shufflevector <128 x i16> %broadcast.splatinsert212.1, <128 x i16> poison, <128 x i32> zeroinitializer + %broadcast.splatinsert208.1 = insertelement <128 x i16> poison, i16 %3, i32 0 + %broadcast.splat209.1 = shufflevector <128 x i16> %broadcast.splatinsert208.1, <128 x i16> poison, <128 x i32> zeroinitializer + %7 = zext <128 x i8> %src to <128 x i16> + %8 = mul nuw <128 x i16> %broadcast.splat209.1, %7 + %9 = add <128 x i16> %8, zeroinitializer + %10 = zext <128 x i16> %9 to <128 x i32> + %11 = mul nuw nsw <128 x i16> %broadcast.splat213.1, %7 + %12 = add nuw <128 x i16> %11, zeroinitializer + %13 = lshr <128 x i16> %12, + %14 = zext <128 x i16> %13 to <128 x i32> + %15 = add nuw nsw <128 x i32> %14, %10 + %16 = lshr <128 x i32> %15, + %17 = trunc <128 x i32> %16 to <128 x i16> + %19 = bitcast i16* %dst to <128 x i16>* + store <128 x i16> %17, <128 x i16>* %19, align 1 + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/vec-shuff-invalid-operand.ll b/llvm/test/CodeGen/Hexagon/vec-shuff-invalid-operand.ll new file mode 100644 index 0000000000000..8479d579e8bdd --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vec-shuff-invalid-operand.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=hexagon -mv75 -mhvx -mattr=+hvx-length128b < %s | FileCheck %s + +; HexagonOptShuffleVector moved the shufflevector instruction to after +; the last add: +; %v5 = add nsw <128 x i32> %v4, %a0 +; That is incorrect, because the order of elements in the %a0 operand +; will not reflect the new shuffle. + +; CHECK: vadd +; CHECK-NOT: vshuff + +define dllexport void @f0(<128 x i32> %a0) local_unnamed_addr { +b0: + %v0 = load <128 x i8>, <128 x i8>* poison, align 128 + %v1 = call <128 x i16> @llvm.hexagon.vmpy.uu.v128i16(<128 x i8> %v0, <128 x i8> ) + %v2 = shufflevector <128 x i16> %v1, <128 x i16> poison, <128 x i32> + %v3 = zext <128 x i16> %v2 to <128 x i32> + %v4 = add nsw <128 x i32> %v3, + %v5 = add nsw <128 x i32> %v4, %a0 + %v6 = getelementptr <128 x i32>, <128 x i32>* null, i32 -1 + store <128 x i32> %v5, <128 x i32>* %v6, align 128 + call void @llvm.trap() + unreachable +} + +; Function Attrs: nounwind readnone +declare <128 x i16> @llvm.hexagon.vmpy.uu.v128i16(<128 x i8>, <128 x i8>) + +; Function Attrs: cold noreturn nounwind +declare void @llvm.trap() diff --git a/llvm/test/CodeGen/Hexagon/vec-shuff-multi-uses.ll b/llvm/test/CodeGen/Hexagon/vec-shuff-multi-uses.ll new file mode 100644 index 0000000000000..1fd5f9b20f8d0 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vec-shuff-multi-uses.ll @@ -0,0 +1,290 @@ +; RUN: llc -march=hexagon -mattr=+hvxv68,+hvx-length128b -hexagon-opt-shuffvec=true < %s | FileCheck %s + +; This test corresponds to a case where a shufflevector with multiple uses +; was getting incorrectly relocated. The problem was that only one of the uses +; met the safety checks but the pass didn't keep track of it so both +; uses were getting updated at the time of relocation. 
+ +; CHECK-NOT: Relocating after -- {{.*}} = add nuw nsw <128 x i32> + +@.str = private unnamed_addr constant [6 x i8] c"vbor \00", align 1 + +; Function Attrs: nounwind +define dso_local void @vbor(i32 %ntimes, i32 %n, double %ctime, double %dtime, i8* %a, i8* %b, i8* %c, i8* %d, i8* %e, [128 x i8]* %aa, [128 x i8]* %bb, [128 x i8]* %cc) local_unnamed_addr { +entry: + %s = alloca [128 x i8], align 8 + %0 = getelementptr inbounds [128 x i8], [128 x i8]* %s, i32 0, i32 0 + call void @llvm.lifetime.start.p0i8(i64 128, i8* nonnull %0) + tail call void @init(i32 %n, i8* %a, i8* %b, i8* %c, i8* %d, i8* %e, [128 x i8]* %aa, [128 x i8]* %bb, [128 x i8]* %cc, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0)) + %call = tail call i32 bitcast (i32 (...)* @second to i32 ()*)() + %cmp3261 = icmp sgt i32 %n, 0 + %cmp263 = icmp sgt i32 %ntimes, 0 + br i1 %cmp263, label %for.cond2.preheader.preheader, label %for.end141 + +for.cond2.preheader.preheader: + %min.iters.check = icmp ult i32 %n, 64 + %min.iters.check272 = icmp ult i32 %n, 128 + %n.vec = and i32 %n, -128 + %cmp.n = icmp eq i32 %n.vec, %n + %n.vec.remaining = and i32 %n, 64 + %min.epilog.iters.check.not.not = icmp eq i32 %n.vec.remaining, 0 + %n.vec278 = and i32 %n, -64 + %cmp.n281 = icmp eq i32 %n.vec278, %n + br label %for.cond2.preheader + +for.cond2.preheader: ; preds = %for.end, %for.cond2.preheader.preheader + %nl.0264 = phi i32 [ %inc140, %for.end ], [ 0, %for.cond2.preheader.preheader ] + br i1 %cmp3261, label %iter.check, label %for.end + +iter.check: ; preds = %for.cond2.preheader + br i1 %min.iters.check, label %for.body5.preheader, label %vector.main.loop.iter.check + +vector.main.loop.iter.check: ; preds = %iter.check + br i1 %min.iters.check272, label %vec.epilog.ph, label %vector.body + +vector.body: ; preds = %vector.body, %vector.main.loop.iter.check + %index = phi i32 [ %index.next, %vector.body ], [ 0, %vector.main.loop.iter.check ] + %wide.load = load <128 x i8>, <128 x i8>* poison, align 1 + %wide.load273 = load <128 x i8>, <128 x i8>* poison, align 1 + %wide.load274 = load <128 x i8>, <128 x i8>* poison, align 1 + %wide.load275 = load <128 x i8>, <128 x i8>* poison, align 1 + %wide.load276 = load <128 x i8>, <128 x i8>* poison, align 1 + %wide.load511 = load <128 x i8>, <128 x i8>* poison, align 1 + %1 = zext <128 x i8> %wide.load to <128 x i32> + %2 = zext <128 x i8> %wide.load273 to <128 x i32> + %3 = mul nuw nsw <128 x i32> %2, %1 + %4 = zext <128 x i8> %wide.load274 to <128 x i32> + %5 = zext <128 x i8> %wide.load275 to <128 x i32> + %6 = zext <128 x i8> %wide.load276 to <128 x i32> + %7 = zext <128 x i8> %wide.load511 to <128 x i32> + %8 = add nuw nsw <128 x i32> %6, %5 + %9 = add nuw nsw <128 x i32> %8, %4 + %10 = add nuw nsw <128 x i32> %9, %7 + %11 = mul nuw nsw <128 x i32> %3, %10 + %12 = mul nuw nsw <128 x i32> %4, %1 + %13 = mul nuw nsw <128 x i32> %12, %5 + %14 = mul nuw nsw <128 x i32> %5, %1 + %15 = mul nuw nsw <128 x i32> %6, %1 + %16 = add nuw nsw <128 x i32> %14, %12 + %17 = add nuw nsw <128 x i32> %16, %15 + %18 = mul nuw nsw <128 x i32> %17, %7 + %19 = mul nuw nsw <128 x i32> %16, %6 + %20 = add nuw nsw <128 x i32> %19, %13 + %21 = add nuw nsw <128 x i32> %20, %11 + %22 = add nuw nsw <128 x i32> %21, %18 + %23 = add nuw nsw <128 x i32> %8, %7 + %24 = mul nuw nsw <128 x i32> %23, %4 + %25 = mul nuw nsw <128 x i32> %7, %6 + %26 = add nuw nsw <128 x i32> %24, %25 + %27 = add nuw nsw <128 x i32> %7, %6 + %28 = mul nuw nsw <128 x i32> %27, %5 + %29 = add nuw nsw <128 x i32> %26, %28 + %30 = 
mul nuw nsw <128 x i32> %29, %2 + %31 = add <128 x i8> %wide.load511, %wide.load276 + %32 = mul <128 x i8> %31, %wide.load275 + %33 = mul <128 x i8> %wide.load511, %wide.load276 + %34 = add <128 x i8> %32, %33 + %35 = shl <128 x i32> %22, + %36 = ashr exact <128 x i32> %35, + %37 = shl <128 x i32> %30, + %38 = ashr exact <128 x i32> %37, + %39 = mul nsw <128 x i32> %36, %38 + %40 = trunc <128 x i32> %39 to <128 x i8> + %41 = mul <128 x i8> %33, %wide.load274 + %42 = mul <128 x i8> %41, %wide.load275 + %43 = mul <128 x i8> %42, %34 + %44 = mul <128 x i8> %43, %40 + %45 = getelementptr inbounds [128 x i8], [128 x i8]* %s, i32 0, i32 %index + %46 = bitcast i8* %45 to <128 x i8>* + store <128 x i8> %44, <128 x i8>* %46, align 8 + %index.next = add nuw i32 %index, 128 + %47 = icmp eq i32 %index.next, %n.vec + br i1 %47, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + br i1 %cmp.n, label %for.end, label %vec.epilog.iter.check + +vec.epilog.iter.check: ; preds = %middle.block + br i1 %min.epilog.iters.check.not.not, label %for.body5.preheader, label %vec.epilog.ph + +vec.epilog.ph: ; preds = %vec.epilog.iter.check, %vector.main.loop.iter.check + %vec.epilog.resume.val = phi i32 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ] + br label %vec.epilog.vector.body + +vec.epilog.vector.body: ; preds = %vec.epilog.vector.body, %vec.epilog.ph + %index279 = phi i32 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next280, %vec.epilog.vector.body ] + %48 = getelementptr inbounds i8, i8* %a, i32 %index279 + %49 = bitcast i8* %48 to <64 x i8>* + %wide.load282 = load <64 x i8>, <64 x i8>* %49, align 1 + %50 = getelementptr inbounds i8, i8* %b, i32 %index279 + %51 = bitcast i8* %50 to <64 x i8>* + %wide.load283 = load <64 x i8>, <64 x i8>* %51, align 1 + %52 = getelementptr inbounds i8, i8* %c, i32 %index279 + %53 = bitcast i8* %52 to <64 x i8>* + %wide.load284 = load <64 x i8>, <64 x i8>* %53, align 1 + %54 = getelementptr inbounds i8, i8* %d, i32 %index279 + %55 = bitcast i8* %54 to <64 x i8>* + %wide.load285 = load <64 x i8>, <64 x i8>* %55, align 1 + %56 = getelementptr inbounds i8, i8* %e, i32 %index279 + %57 = bitcast i8* %56 to <64 x i8>* + %wide.load286 = load <64 x i8>, <64 x i8>* %57, align 1 + %wide.load312 = load <64 x i8>, <64 x i8>* poison, align 1 + %58 = zext <64 x i8> %wide.load282 to <64 x i32> + %59 = zext <64 x i8> %wide.load283 to <64 x i32> + %60 = mul nuw nsw <64 x i32> %59, %58 + %61 = zext <64 x i8> %wide.load284 to <64 x i32> + %62 = zext <64 x i8> %wide.load285 to <64 x i32> + %63 = zext <64 x i8> %wide.load286 to <64 x i32> + %64 = zext <64 x i8> %wide.load312 to <64 x i32> + %65 = add nuw nsw <64 x i32> %63, %62 + %66 = add nuw nsw <64 x i32> %65, %61 + %67 = add nuw nsw <64 x i32> %66, %64 + %68 = mul nuw nsw <64 x i32> %60, %67 + %69 = mul nuw nsw <64 x i32> %61, %58 + %70 = mul nuw nsw <64 x i32> %69, %62 + %71 = mul nuw nsw <64 x i32> %62, %58 + %72 = mul nuw nsw <64 x i32> %63, %58 + %73 = add nuw nsw <64 x i32> %71, %69 + %74 = add nuw nsw <64 x i32> %73, %72 + %75 = mul nuw nsw <64 x i32> %74, %64 + %76 = mul nuw nsw <64 x i32> %73, %63 + %77 = add nuw nsw <64 x i32> %76, %70 + %78 = add nuw nsw <64 x i32> %77, %68 + %79 = add nuw nsw <64 x i32> %78, %75 + %80 = add nuw nsw <64 x i32> %65, %64 + %81 = mul nuw nsw <64 x i32> %80, %61 + %82 = mul nuw nsw <64 x i32> %64, %63 + %83 = add nuw nsw <64 x i32> %81, %82 + %84 = add nuw nsw <64 x i32> %64, %63 + %85 = mul nuw nsw <64 x i32> %84, %62 + %86 = add nuw nsw <64 
x i32> %83, %85 + %87 = mul nuw nsw <64 x i32> %86, %59 + %88 = add <64 x i8> %wide.load312, %wide.load286 + %89 = mul <64 x i8> %88, %wide.load285 + %90 = mul <64 x i8> %wide.load312, %wide.load286 + %91 = add <64 x i8> %89, %90 + %92 = shl <64 x i32> %79, + %93 = ashr exact <64 x i32> %92, + %94 = shl <64 x i32> %87, + %95 = ashr exact <64 x i32> %94, + %96 = mul nsw <64 x i32> %93, %95 + %97 = trunc <64 x i32> %96 to <64 x i8> + %98 = mul <64 x i8> %90, %wide.load284 + %99 = mul <64 x i8> %98, %wide.load285 + %100 = mul <64 x i8> %99, %91 + %101 = mul <64 x i8> %100, %97 + %102 = getelementptr inbounds [128 x i8], [128 x i8]* %s, i32 0, i32 %index279 + %103 = bitcast i8* %102 to <64 x i8>* + store <64 x i8> %101, <64 x i8>* %103, align 8 + %index.next280 = add nuw i32 %index279, 64 + %104 = icmp eq i32 %index.next280, %n.vec278 + br i1 %104, label %vec.epilog.middle.block, label %vec.epilog.vector.body + +vec.epilog.middle.block: ; preds = %vec.epilog.vector.body + br i1 %cmp.n281, label %for.end, label %for.body5.preheader + +for.body5.preheader: ; preds = %vec.epilog.middle.block, %vec.epilog.iter.check, %iter.check + %i.0262.ph = phi i32 [ 0, %iter.check ], [ %n.vec, %vec.epilog.iter.check ], [ %n.vec278, %vec.epilog.middle.block ] + br label %for.body5 + +for.body5: ; preds = %for.body5, %for.body5.preheader + %i.0262 = phi i32 [ %inc, %for.body5 ], [ %i.0262.ph, %for.body5.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.0262 + %105 = load i8, i8* %arrayidx, align 1 + %arrayidx6 = getelementptr inbounds i8, i8* %b, i32 %i.0262 + %106 = load i8, i8* %arrayidx6, align 1 + %arrayidx7 = getelementptr inbounds i8, i8* %c, i32 %i.0262 + %107 = load i8, i8* %arrayidx7, align 1 + %arrayidx8 = getelementptr inbounds i8, i8* %d, i32 %i.0262 + %108 = load i8, i8* %arrayidx8, align 1 + %arrayidx9 = getelementptr inbounds i8, i8* %e, i32 %i.0262 + %109 = load i8, i8* %arrayidx9, align 1 + %arrayidx11 = getelementptr inbounds [128 x i8], [128 x i8]* %aa, i32 %i.0262, i32 0 + %110 = load i8, i8* %arrayidx11, align 1 + %conv12266 = zext i8 %105 to i32 + %conv13267 = zext i8 %106 to i32 + %mul = mul nuw nsw i32 %conv13267, %conv12266 + %conv14268 = zext i8 %107 to i32 + %conv19269 = zext i8 %108 to i32 + %conv24270 = zext i8 %109 to i32 + %conv30271 = zext i8 %110 to i32 + %mul20243 = add nuw nsw i32 %conv24270, %conv19269 + %mul25244 = add nuw nsw i32 %mul20243, %conv14268 + %mul31245 = add nuw nsw i32 %mul25244, %conv30271 + %add32 = mul nuw nsw i32 %mul, %mul31245 + %mul35 = mul nuw nsw i32 %conv14268, %conv12266 + %mul37 = mul nuw nsw i32 %mul35, %conv19269 + %mul53 = mul nuw nsw i32 %conv19269, %conv12266 + %mul67 = mul nuw nsw i32 %conv24270, %conv12266 + %reass.add = add nuw nsw i32 %mul53, %mul35 + %reass.add250 = add nuw nsw i32 %reass.add, %mul67 + %reass.mul = mul nuw nsw i32 %reass.add250, %conv30271 + %reass.mul252 = mul nuw nsw i32 %reass.add, %conv24270 + %add56 = add nuw nsw i32 %reass.mul252, %mul37 + %add62 = add nuw nsw i32 %add56, %add32 + %add68 = add nuw nsw i32 %add62, %reass.mul + %mul85247 = add nuw nsw i32 %mul20243, %conv30271 + %add86 = mul nuw nsw i32 %mul85247, %conv14268 + %mul103 = mul nuw nsw i32 %conv30271, %conv24270 + %reass.add253 = add nuw nsw i32 %add86, %mul103 + %reass.add255 = add nuw nsw i32 %conv30271, %conv24270 + %reass.mul256 = mul nuw nsw i32 %reass.add255, %conv19269 + %reass.add259 = add nuw nsw i32 %reass.add253, %reass.mul256 + %reass.mul260 = mul nuw nsw i32 %reass.add259, %conv13267 + %mul115248 = add i8 %110, %109 + %add116 = 
mul i8 %mul115248, %108 + %mul121 = mul i8 %110, %109 + %reass.add257 = add i8 %add116, %mul121 + %sext = shl i32 %add68, 24 + %conv130 = ashr exact i32 %sext, 24 + %sext249 = shl i32 %reass.mul260, 24 + %conv131 = ashr exact i32 %sext249, 24 + %mul132 = mul nsw i32 %conv130, %conv131 + %111 = trunc i32 %mul132 to i8 + %112 = mul i8 %mul121, %107 + %mul126 = mul i8 %112, %108 + %mul128 = mul i8 %mul126, %reass.add257 + %conv137 = mul i8 %mul128, %111 + %arrayidx138 = getelementptr inbounds [128 x i8], [128 x i8]* %s, i32 0, i32 %i.0262 + store i8 %conv137, i8* %arrayidx138, align 1 + %inc = add nuw nsw i32 %i.0262, 1 + %exitcond.not = icmp eq i32 %inc, %n + br i1 %exitcond.not, label %for.end, label %for.body5 + +for.end: ; preds = %for.body5, %vec.epilog.middle.block, %middle.block, %for.cond2.preheader + tail call void @dummy(i32 %n, i8* %a, i8* %b, i8* %c, i8* %d, i8* %e, [128 x i8]* %aa, [128 x i8]* %bb, [128 x i8]* %cc, i8 signext 1) + %inc140 = add nuw nsw i32 %nl.0264, 1 + %exitcond265.not = icmp eq i32 %inc140, %ntimes + br i1 %exitcond265.not, label %for.end141, label %for.cond2.preheader + +for.end141: ; preds = %for.end, %entry + %conv = sitofp i32 %call to double + %call142 = tail call i32 bitcast (i32 (...)* @second to i32 ()*)() + %conv143 = sitofp i32 %call142 to double + %sub = fsub double %conv143, %conv + %sub144 = fsub double %sub, %ctime + %conv145 = sitofp i32 %ntimes to double + %mul146 = fmul double %conv145, %dtime + %sub147 = fsub double %sub144, %mul146 + %call148 = call i64 @cs1d(i32 %n, i8* nonnull %0) + %mul149 = mul nsw i32 %n, %ntimes + call void @check(i64 %call148, i32 %mul149, i32 %n, double %sub147, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0)) + call void @llvm.lifetime.end.p0i8(i64 128, i8* nonnull %0) + ret void +} + +; Function Attrs: argmemonly nofree nosync nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) + +declare dso_local void @init(i32, i8*, i8*, i8*, i8*, i8*, [128 x i8]*, [128 x i8]*, [128 x i8]*, i8*) local_unnamed_addr + +declare dso_local i32 @second(...) 
local_unnamed_addr + +declare dso_local void @dummy(i32, i8*, i8*, i8*, i8*, i8*, [128 x i8]*, [128 x i8]*, [128 x i8]*, i8 signext) local_unnamed_addr + +declare dso_local i64 @cs1d(i32, i8*) local_unnamed_addr + +declare dso_local void @check(i64, i32, i32, double, i8*) local_unnamed_addr + +; Function Attrs: argmemonly nofree nosync nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) diff --git a/llvm/test/CodeGen/Hexagon/vec-shuff2.ll b/llvm/test/CodeGen/Hexagon/vec-shuff2.ll new file mode 100644 index 0000000000000..d5a4091916c74 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vec-shuff2.ll @@ -0,0 +1,106 @@ +; RUN: llc -march=hexagon -hexagon-opt-shuffvec -hexagon-widen-short-vector -hexagon-hvx-widen=32 -mv73 -mhvx -mattr=+hvx-length128b < %s +; REQUIRES: asserts + +define dllexport i32 @test(ptr noalias align 128 %0, ptr noalias align 128 %1, ptr noalias align 128 %2) local_unnamed_addr { +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %0, i32 128) ] + call void @llvm.assume(i1 true) [ "align"(ptr %1, i32 128) ] + call void @llvm.assume(i1 true) [ "align"(ptr %2, i32 128) ] + %3 = load <32 x i8>, ptr %2, align 128 + %4 = zext <32 x i8> %3 to <32 x i32> + %5 = mul nuw nsw <32 x i32> %4, + %scevgep = getelementptr i32, ptr %0, i32 128 + %scevgep13 = getelementptr i8, ptr %1, i32 128 + br label %for_begin1.preheader + +for_begin1.preheader: ; preds = %for_end3, %entry + %lsr.iv14 = phi ptr [ %scevgep15, %for_end3 ], [ %scevgep13, %entry ] + %lsr.iv1 = phi ptr [ %scevgep2, %for_end3 ], [ %scevgep, %entry ] + %6 = phi i32 [ 0, %entry ], [ %47, %for_end3 ] + br label %for_body2 + +for_end: ; preds = %for_end3 + ret i32 0 + +for_body2: ; preds = %for_body2, %for_begin1.preheader + %lsr.iv16 = phi ptr [ %scevgep17, %for_body2 ], [ %lsr.iv14, %for_begin1.preheader ] + %lsr.iv3 = phi ptr [ %scevgep4, %for_body2 ], [ %lsr.iv1, %for_begin1.preheader ] + %lsr.iv = phi i32 [ %lsr.iv.next, %for_body2 ], [ 128, %for_begin1.preheader ] + %scevgep20 = getelementptr <32 x i8>, ptr %lsr.iv16, i32 -4 + %7 = load <32 x i8>, ptr %scevgep20, align 128 + %8 = zext <32 x i8> %7 to <32 x i32> + %9 = mul nuw nsw <32 x i32> %8, + %10 = add nsw <32 x i32> %9, + %11 = add nsw <32 x i32> %10, %5 + %scevgep6 = getelementptr <32 x i32>, ptr %lsr.iv3, i32 -4 + store <32 x i32> %11, ptr %scevgep6, align 128 + %scevgep21 = getelementptr <32 x i8>, ptr %lsr.iv16, i32 -3 + %12 = load <32 x i8>, ptr %scevgep21, align 32 + %13 = zext <32 x i8> %12 to <32 x i32> + %14 = mul nuw nsw <32 x i32> %13, + %15 = add nsw <32 x i32> %14, + %16 = add nsw <32 x i32> %15, %5 + %scevgep8 = getelementptr <32 x i32>, ptr %lsr.iv3, i32 -3 + store <32 x i32> %16, ptr %scevgep8, align 128 + %scevgep22 = getelementptr <32 x i8>, ptr %lsr.iv16, i32 -2 + %17 = load <32 x i8>, ptr %scevgep22, align 64 + %18 = zext <32 x i8> %17 to <32 x i32> + %19 = mul nuw nsw <32 x i32> %18, + %20 = add nsw <32 x i32> %19, + %21 = add nsw <32 x i32> %20, %5 + %scevgep9 = getelementptr <32 x i32>, ptr %lsr.iv3, i32 -2 + store <32 x i32> %21, ptr %scevgep9, align 128 + %scevgep23 = getelementptr <32 x i8>, ptr %lsr.iv16, i32 -1 + %22 = load <32 x i8>, ptr %scevgep23, align 32 + %23 = zext <32 x i8> %22 to <32 x i32> + %24 = mul nuw nsw <32 x i32> %23, + %25 = add nsw <32 x i32> %24, + %26 = add nsw <32 x i32> %25, %5 + %scevgep10 = getelementptr <32 x i32>, ptr %lsr.iv3, i32 -1 + store <32 x i32> %26, ptr %scevgep10, align 128 + %27 = load <32 x i8>, ptr %lsr.iv16, align 128 + %28 = zext <32 x i8> %27 to <32 x i32> + %29 = 
mul nuw nsw <32 x i32> %28, + %30 = add nsw <32 x i32> %29, + %31 = add nsw <32 x i32> %30, %5 + store <32 x i32> %31, ptr %lsr.iv3, align 128 + %scevgep24 = getelementptr <32 x i8>, ptr %lsr.iv16, i32 1 + %32 = load <32 x i8>, ptr %scevgep24, align 32 + %33 = zext <32 x i8> %32 to <32 x i32> + %34 = mul nuw nsw <32 x i32> %33, + %35 = add nsw <32 x i32> %34, + %36 = add nsw <32 x i32> %35, %5 + %scevgep12 = getelementptr <32 x i32>, ptr %lsr.iv3, i32 1 + store <32 x i32> %36, ptr %scevgep12, align 128 + %scevgep25 = getelementptr <32 x i8>, ptr %lsr.iv16, i32 2 + %37 = load <32 x i8>, ptr %scevgep25, align 64 + %38 = zext <32 x i8> %37 to <32 x i32> + %39 = mul nuw nsw <32 x i32> %38, + %40 = add nsw <32 x i32> %39, + %41 = add nsw <32 x i32> %40, %5 + %scevgep11 = getelementptr <32 x i32>, ptr %lsr.iv3, i32 2 + store <32 x i32> %41, ptr %scevgep11, align 128 + %scevgep19 = getelementptr <32 x i8>, ptr %lsr.iv16, i32 3 + %42 = load <32 x i8>, ptr %scevgep19, align 32 + %43 = zext <32 x i8> %42 to <32 x i32> + %44 = mul nuw nsw <32 x i32> %43, + %45 = add nsw <32 x i32> %44, + %46 = add nsw <32 x i32> %45, %5 + %scevgep7 = getelementptr <32 x i32>, ptr %lsr.iv3, i32 3 + store <32 x i32> %46, ptr %scevgep7, align 128 + %lsr.iv.next = add nsw i32 %lsr.iv, -8 + %scevgep4 = getelementptr i32, ptr %lsr.iv3, i32 256 + %scevgep17 = getelementptr i8, ptr %lsr.iv16, i32 256 + %exitcond.not.7 = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond.not.7, label %for_end3, label %for_body2 + +for_end3: ; preds = %for_body2 + %47 = add nuw nsw i32 %6, 1 + %scevgep2 = getelementptr i32, ptr %lsr.iv1, i32 4096 + %scevgep15 = getelementptr i8, ptr %lsr.iv14, i32 4096 + %exitcond4.not = icmp eq i32 %47, 128 + br i1 %exitcond4.not, label %for_end, label %for_begin1.preheader +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) +declare void @llvm.assume(i1 noundef) diff --git a/llvm/test/CodeGen/Hexagon/vmpa.ll b/llvm/test/CodeGen/Hexagon/vmpa.ll new file mode 100644 index 0000000000000..10f18195dc1b7 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vmpa.ll @@ -0,0 +1,64 @@ +; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s + +; Test for vmpa instruction. 
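Illustrative only, not part of the patch: the tests below boil down to the shape sketched here, namely two zero-extended byte vectors each scaled by a splat and then accumulated, which is the pattern the vmpa checks expect to be covered by a single multiply-accumulate. The scalar weights used are hypothetical.

define <128 x i16> @vmpa_shape_sketch(<128 x i8> %a, <128 x i8> %b) {
  %az = zext <128 x i8> %a to <128 x i16>
  %bz = zext <128 x i8> %b to <128 x i16>
  %m0 = mul nuw <128 x i16> %az, splat (i16 3)   ; first weight (hypothetical)
  %m1 = mul nuw <128 x i16> %bz, splat (i16 5)   ; second weight (hypothetical)
  %acc = add <128 x i16> %m0, %m1                ; pairwise accumulate
  ret <128 x i16> %acc
}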
+ +; CHECK-LABEL: test_vmpa8 +; CHECK: = vmpa(v{{[0-9:]+}}.ub,r{{[0-9]+}}.b) + +; Function Attrs: nounwind +define dllexport void @test_vmpa8(i64 %seed0, i64 %seed1, + <128 x i8> %srcA, <128 x i8> %srcB, + i8* %dst) local_unnamed_addr { +entry: + %1 = trunc i64 %seed0 to i16 + %3 = trunc i64 %seed1 to i8 + %4 = and i8 %3, 127 + %5 = insertelement <128 x i8> poison, i8 %4, i32 0 + %6 = shufflevector <128 x i8> %5, <128 x i8> poison, <128 x i32> zeroinitializer + %7 = zext <128 x i8> %6 to <128 x i16> + %8 = and i16 %1, 127 + %9 = insertelement <128 x i16> poison, i16 %8, i32 0 + %10 = shufflevector <128 x i16> %9, <128 x i16> poison, <128 x i32> zeroinitializer + %11 = zext <128 x i8> %srcA to <128 x i16> + %12 = zext <128 x i8> %srcB to <128 x i16> + %13 = mul nuw nsw <128 x i16> %11, %7 + %14 = mul nuw nsw <128 x i16> %10, %12 + %15 = add nuw <128 x i16> %14, %13 + %16 = lshr <128 x i16> %15, + %17 = add <128 x i16> zeroinitializer, %16 + %18 = trunc <128 x i16> %17 to <128 x i8> + %21 = bitcast i8* %dst to <128 x i8>* + store <128 x i8> %18, <128 x i8>* %21, align 128 + ret void +} + +; CHECK-LABEL: test_vmpa16 +; CHECK: = vmpa(v{{[0-9:]+}}.uh,r{{[0-9]+}}.b) + +; Function Attrs: nounwind +define dllexport void @test_vmpa16(i64 %seed0, i64 %seed1, + <64 x i16> %srcA16, <64 x i16> %srcB16, + i16* %dst16) local_unnamed_addr { +entry: + %1 = trunc i64 %seed0 to i32 + %3 = trunc i64 %seed1 to i32 + %4 = and i32 %3, 127 + %5 = insertelement <64 x i32> poison, i32 %4, i32 0 + %6 = shufflevector <64 x i32> %5, <64 x i32> poison, <64 x i32> zeroinitializer + %7 = and i32 %3, 127 + %8 = and i32 %1, 127 + %9 = insertelement <64 x i32> poison, i32 %8, i32 0 + %10 = shufflevector <64 x i32> %9, <64 x i32> poison, <64 x i32> zeroinitializer + %11 = zext <64 x i16> %srcA16 to <64 x i32> + %12 = zext <64 x i16> %srcB16 to <64 x i32> + %13 = mul nuw nsw <64 x i32> %11, %6 + %14 = mul nuw nsw <64 x i32> %10, %12 + %15 = add nuw <64 x i32> %14, %13 + %16 = lshr <64 x i32> %15, + ;, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> + %17 = add <64 x i32> zeroinitializer, %16 + %18 = trunc <64 x i32> %17 to <64 x i16> + %21 = bitcast i16* %dst16 to <64 x i16>* + store <64 x i16> %18, <64 x i16>* %21, align 128 + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/vmpy-const.ll b/llvm/test/CodeGen/Hexagon/vmpy-const.ll new file mode 100644 index 0000000000000..6b3f4c96ad364 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vmpy-const.ll @@ -0,0 +1,273 @@ +; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s + +; Make sure that the appropriate vmpy instructions are generated when +; multiplied with a vector of constant values. 
+ +; CHECK-LABEL: test_vmpy_const1 +; CHECK: v{{[0-9:]+}}.uh = vmpy(v{{[0-9]+}}.ub,r{{[0-9]+}}.ub) +; CHECK: v{{[0-9:]+}}.uw = vunpack(v{{[0-9]+}}.uh) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const1(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %1 = mul nuw nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const2 +; CHECK: v{{[0-9:]+}}.h = vmpy(v{{[0-9]+}}.ub,r{{[0-9]+}}.b) +; CHECK: v{{[0-9:]+}}.w = vunpack(v{{[0-9]+}}.h) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const2(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %1 = mul nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const2_1 +; CHECK: [[REG0:(r[0-9]+)]] = ##- +; CHECK: [[VREG0:(v[0-9]+)]] = vmem +; CHECK: [[VREG1:(v[0-9]+)]] = vsplat([[REG0]]) +; CHECK: = vunpack([[VREG0]].ub) +; CHECK: v{{[0-9:]+}}.w = vmpy([[VREG1]].h,v{{[0-9]+}}.uh) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const2_1(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %1 = mul nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const3 +; CHECK: v{{[0-9:]+}}.uw = vmpy(v{{[0-9]+}}.uh,r{{[0-9]+}}.uh) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const3(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <64 x i16>, ptr %a, align 2 + %0 = zext <64 x i16> %wide.load to <64 x i32> + %1 = mul nuw nsw <64 x i32> %0, + store <64 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const4 +; CHECK: [[REG0:(r[0-9]+)]] = #- +; CHECK: [[VREG0:(v[0-9]+)]].h = vsplat([[REG0]]) +; CHECK: v{{[0-9:]+}}.w = vmpy([[VREG0]].h,v{{[0-9]+}}.uh) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const4(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <64 x i16>, ptr %a, align 2 + %0 = zext <64 x i16> %wide.load to <64 x i32> + %1 = mul nsw <64 x i32> %0, + store <64 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const5 +; CHECK: [[REG0:(r[0-9]+)]] = #- +; CHECK: [[VREG0:(v[0-9]+)]].h = vsplat([[REG0]]) +; CHECK: v{{[0-9:]+}}.w = vmpy([[VREG0]].h,v{{[0-9]+}}.uh) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const5(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <64 x i16>, ptr %a, align 2 + %0 = zext <64 x i16> %wide.load to <64 x i32> + %1 = mul nsw <64 x i32> %0, + store <64 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const6 +; CHECK: [[REG0:(r[0-9]+)]] = #-23 +; CHECK: [[VREG0:(v[0-9]+)]] = vsplat([[REG0]]) +; CHECK: [[VREG1:(v[0-9:]+.w)]] = vmpyieo(v{{[0-9]+}}.h,[[VREG0]].h) +; CHECK: [[VREG1]] += vmpyie + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const6(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <32 x i32>, ptr %a, align 4 + %0 = mul nsw <32 x i32> %wide.load, + store <32 x i32> %0, ptr %r, align 4 + ret void +} + +; 
CHECK-LABEL: test_vmpy_const7 +; CHECK: [[REG0:(r[0-9]+)]] = ##.L +; CHECK: [[VREG0:(v[0-9]+)]] = vmemu(r0+#0) +; CHECK: [[VREG1:(v[0-9]+)]] = vmem([[REG0]]+#0) +; CHECK: v{{[0-9:]+}}.h = vmpy([[VREG0]].ub,[[VREG1]].b) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const7(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %1 = mul nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const8 +; CHECK: v{{[0-9:]+}}.uh = vmpy(v{{[0-9]+}}.ub,r{{[0-9]+}}.ub) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const8(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i16> + %1 = mul nuw nsw <128 x i16> %0, + store <128 x i16> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const9 +; CHECK: v{{[0-9:]+}}.h = vmpy(v{{[0-9]+}}.ub,r{{[0-9]+}}.b) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const9(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i16> + %1 = mul nuw nsw <128 x i16> %0, + store <128 x i16> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const10 +; CHECK: v{{[0-9:]+}}.uw = vmpy(v{{[0-9]+}}.uh,r{{[0-9]+}}.uh) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const10(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i16>, ptr %a, align 1 + %0 = zext <128 x i16> %wide.load to <128 x i32> + %1 = mul nuw nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const11 +; CHECK: v{{[0-9:]+}}.w = vmpy(v{{[0-9]+}}.h,r{{[0-9]+}}.h) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const11(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i16>, ptr %a, align 1 + %0 = sext <128 x i16> %wide.load to <128 x i32> + %1 = mul nuw nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const12 +; CHECK: [[VREG0:(v[0-9]+)]] = vmemu(r{{[0-9\+\#0-9]+}}) +; CHECK: v{{[0-9:]+}}.h = vmpy(v{{[0-9]+}}.ub,[[VREG0]].b) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const12(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = sext <128 x i8> %wide.load to <128 x i16> + %1 = mul nuw nsw <128 x i16> %0, + store <128 x i16> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const13 +; CHECK: [[VREG0:(v[0-9]+)]] = vmemu(r{{[0-9\+\#0-9]+}}) +; CHECK: v{{[0-9:]+}}.w = vmpy([[VREG0]].h,v{{[0-9]+}}.uh) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const13(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i16>, ptr %a, align 1 + %0 = sext <128 x i16> %wide.load to <128 x i32> + %1 = mul nuw nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const14 +; CHECK: v{{[0-9:]+}}.uh = vmpy(v{{[0-9]+}}.ub,r{{[0-9]+}}.ub) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const14(ptr nocapture readonly %a, ptr nocapture %r) 
local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i16> + %1 = shl nuw nsw <128 x i16> %0, + store <128 x i16> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const15 +; CHECK: v{{[0-9:]+}}.uh = vunpack(v{{[0-9]+}}.ub) +; CHECK: v{{[0-9:]+}}.h = vasl(v{{[0-9]+}}.h,r{{[0-9]+}}) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const15(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i16> + %1 = shl nuw nsw <128 x i16> %0, + store <128 x i16> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const16 +; CHECK: v{{[0-9:]+}}.uw = vmpy(v{{[0-9]+}}.uh,r{{[0-9]+}}.uh) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const16(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i16>, ptr %a, align 1 + %0 = zext <128 x i16> %wide.load to <128 x i32> + %1 = shl nuw nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy_const17 +; CHECK: v{{[0-9:]+}}.uw = vunpack(v{{[0-9]+}}.uh) +; CHECK: v{{[0-9:]+}}.w = vasl(v{{[0-9]+}}.w,r{{[0-9]+}}) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const17(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i16>, ptr %a, align 1 + %0 = zext <128 x i16> %wide.load to <128 x i32> + %1 = shl nuw nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + + +; CHECK-LABEL: test_vmpy_const18 +; CHECK: r{{[0-9]+}} = #2 +; CHECK: v{{[0-9:]+}}.b = vsplat(r{{[0-9]+}}) +; CHECK: v{{[0-9:]+}}.h = vmpy(v{{[0-9]+}}.ub,v{{[0-9]+}}.b) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy_const18(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = sext <128 x i8> %wide.load to <128 x i32> + %1 = shl nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/vmpy-qfp-const.ll b/llvm/test/CodeGen/Hexagon/vmpy-qfp-const.ll new file mode 100644 index 0000000000000..c1e61fdaacceb --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vmpy-qfp-const.ll @@ -0,0 +1,71 @@ +; In this example operands in fmul instruction are (fpext, constant_vector). The generated assembly +; should contains vsplat instruction followed by multiplication of two halfs whose result is of type qf32. 
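For illustration only (not part of the patch): a minimal sketch of the kind of input that check1 below covers. The constant vector operands are not reproduced in this patch text; the value used here is hypothetical, and presumably the rewrite is only profitable when the constant is exactly representable as fp16 so it can be splatted and multiplied as .hf.

define void @qfp_const_sketch(ptr %a, ptr %r) {
entry:
  %v = load <64 x half>, ptr %a, align 2
  %w = fpext <64 x half> %v to <64 x float>
  %m = fmul <64 x float> %w, splat (float 2.500000e+00)   ; constant is hypothetical
  store <64 x float> %m, ptr %r, align 4
  ret void
}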
+; RUN: llc -march=hexagon -mattr=+hvxv69,+hvx-length128b < %s | FileCheck %s
+
+; CHECK-LABEL: check1
+; CHECK: [[REG0:(r[0-9]+)]] = ##
+; CHECK: [[VREG0:(v[0-9]+)]] = vsplat([[REG0]])
+; CHECK: v{{[0-9:]+}}.qf32 = vmpy(v{{[0-9]+}}.hf,[[VREG0]].hf)
+
+; Function Attrs: norecurse nounwind
+define dso_local void @check1(half* nocapture readonly %a, float* nocapture %r) local_unnamed_addr {
+entry:
+  %0 = bitcast half* %a to <64 x half>*
+  %wide.load = load <64 x half>, <64 x half>* %0, align 2
+  %1 = fpext <64 x half> %wide.load to <64 x float>
+  %2 = fmul <64 x float> %1, 
+  %3 = bitcast float* %r to <64 x float>*
+  store <64 x float> %2, <64 x float>* %3, align 4
+  ret void
+}
+
+; Widening float vector with vector-width 128
+; CHECK-LABEL: check2
+; CHECK: v{{[0-9:]+}}.qf32 = vmpy(v{{[0-9]+}}.hf,[[VREG1:(v[0-9]+)]].hf)
+; CHECK: [[VREG1]].cur = vmem(r{{[0-9\+\#0-9]+}})
+; CHECK: v{{[0-9:]+}}.qf32 = vmpy(v{{[0-9]+}}.hf,[[VREG1]].hf)
+define dllexport void @check2(i8* noalias nocapture writeonly align 128 %0, i8* noalias nocapture readonly align 128 %1) #0 {
+  %3 = bitcast i8* %0 to <128 x float>*
+  %4 = bitcast i8* %1 to <128 x half>*
+  %5 = load <128 x half>, <128 x half>* %4, align 128
+  %6 = fpext <128 x half> %5 to <128 x float>
+  %7 = fmul nnan nsz <128 x float> %6, 
+  store <128 x float> %7, <128 x float>* %3, align 128
+  ret void
+}
+
+; The widening float vector pass does not handle instructions with
+; vector-width > 128. Instead, during ISel, FPExtend on the operands
+; of FMUL will generate a sequence of vmpy_qf32_hf and shuffle instructions
+; to convert float16 to float32.
+; Later, a vmpy_qf32_sf instruction will be generated to multiply
+; the two operands of the FMUL instruction.
+; CHECK-LABEL: check3
+; CHECK: v{{[0-9:]+}}.qf32 = vmpy(v{{[0-9]+}}.sf,v{{[0-9]+}}.sf)
+define dllexport void @check3(i8* noalias nocapture writeonly align 256 %0, i8* noalias nocapture readonly align 256 %1) #0 {
+  %3 = bitcast i8* %0 to <256 x float>*
+  %4 = bitcast i8* %1 to <256 x half>*
+  %5 = load <256 x half>, <256 x half>* %4, align 128
+  %6 = fpext <256 x half> %5 to <256 x float>
+  %7 = fmul nnan nsz <256 x float> %6, 
+  store <256 x float> %7, <256 x float>* %3, align 128
+  ret void
+}
+
+; Widening float vector 32xf16
+; check4 also serves as a test case for HexagonOptShuffleVector with a single Hi/Lo use, where the pass must not relocate the shuffle instruction generated by HexagonGenWideningVecFloatInstr; otherwise the function would be broken.
+; CHECK-LABEL: check4 +; CHECK: v{{[0-9:]+}}.qf32 = vmpy(v{{[0-9]+}}.hf,v{{[0-9:]+}}.hf) +; CHECK: v{{[0-9:]+}} = vshuff(v{{[0-9]+}},v{{[0-9:]+}},r{{[0-9]+}}) +define dso_local void @check4(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %r) local_unnamed_addr #0 { +entry: + %0 = bitcast half* %a to <32 x half>* + %wide.load.0 = load <32 x half>, <32 x half>* %0, align 2 + %1 = bitcast half* %b to <32 x half>* + %wide.load.1 = load <32 x half>, <32 x half>* %1, align 2 + %2 = fpext <32 x half> %wide.load.0 to <32 x float> + %3 = fpext <32 x half> %wide.load.1 to <32 x float> + %4= fmul <32 x float> %2, %3 + store <32 x float> %4, <32 x float>* %r, align 4 + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/vsub-const.ll b/llvm/test/CodeGen/Hexagon/vsub-const.ll new file mode 100644 index 0000000000000..236fc0928feb9 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vsub-const.ll @@ -0,0 +1,112 @@ +; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s + +; Make sure that the appropriate vadd instructions are generated when +; addtiplied with a vector of constant values. + +; CHECK-LABEL: test_vadd_const1 +; CHECK: [[REG0:(r[0-9]+)]] = # +; CHECK: [[VREG0:(v[0-9]+)]].b = vsplat([[REG0]]) +; CHECK: v{{[0-9:]+}}.h = vadd(v{{[0-9]+}}.ub,[[VREG0]].ub) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vadd_const1(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %1 = add nuw nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vadd_const2 +; CHECK: [[REG0:(r[0-9]+)]] = #- +; CHECK: [[VREG0:([0-9]+)]].h = vsplat([[REG0]]) +; CHECK: [[VREG1:([0-9:])]] = v[[VREG0]] +; CHECK: v{{[0-9:]+}}.h = vadd(v{{[0-9:]+}}.h,{{v[VREG0]|v[VREG1]}} + +; Function Attrs: norecurse nounwind +define dso_local void @test_vadd_const2(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %1 = add nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vadd_const2_1 +; CHECK: [[REG0:(r[0-9]+)]] = #-270 +; CHECK: [[VREG0:([0-9]+)]] = vsplat([[REG0]]) +; CHECK: [[VREG1:([0-9:]+)]] = v[[VREG0]] +; CHECK: v{{[0-9:]+}}.w = vadd({{.*}}.w,{{v[VREG0]|v[VREG1]}} + +; Function Attrs: norecurse nounwind +define dso_local void @test_vadd_const2_1(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %1 = add nsw <128 x i32> %0, + store <128 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vadd_const3 +; CHECK: [[REG0:(r[0-9]+)]] = # +; CHECK: [[VREG0:(v[0-9]+)]].h = vsplat([[REG0]]) +; CHECK: v{{[0-9:]+}}.w = vadd(v{{[0-9]+}}.uh,[[VREG0]].uh) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vadd_const3(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 { +entry: + %wide.load = load <64 x i16>, ptr %a, align 2 + %0 = zext <64 x i16> %wide.load to <64 x i32> + %1 = add nuw nsw <64 x i32> %0, + store <64 x i32> %1, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vadd_const4 +; CHECK: [[REG0:(r[0-9]+)]] = #-23 +; CHECK: [[VREG0:([0-9]+)]] = vsplat([[REG0]]) +; CHECK: [[VREG1:([0-9:]+)]] = v[[VREG0]] +; CHECK: v{{[0-9:]+}}.w = vadd({{.*}}.w,{{v[VREG0]|v[VREG1]}} + +; 
Function Attrs: norecurse nounwind
+define dso_local void @test_vadd_const4(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 {
+entry:
+  %wide.load = load <64 x i16>, ptr %a, align 2
+  %0 = zext <64 x i16> %wide.load to <64 x i32>
+  %1 = add nsw <64 x i32> %0, 
+  store <64 x i32> %1, ptr %r, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_vadd_const5
+; CHECK: [[REG0:(r[0-9]+)]] = #-257
+; CHECK: [[VREG0:([0-9]+)]] = vsplat([[REG0]])
+; CHECK: [[VREG1:([0-9:]+)]] = v[[VREG0]]
+; CHECK: v{{[0-9:]+}}.w = vadd({{.*}}.w,{{v[VREG1]|v[VREG0]}}
+
+; Function Attrs: norecurse nounwind
+define dso_local void @test_vadd_const5(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 {
+entry:
+  %wide.load = load <64 x i16>, ptr %a, align 2
+  %0 = zext <64 x i16> %wide.load to <64 x i32>
+  %1 = add nsw <64 x i32> %0, 
+  store <64 x i32> %1, ptr %r, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_vadd_const6
+; CHECK: [[REG0:(r[0-9]+)]] = #-23
+; CHECK: [[VREG0:(v[0-9]+)]] = vsplat([[REG0]])
+; CHECK: v{{[0-9:]+}}.w = vadd({{.*}}[[VREG0]].w{{.*}})
+
+; Function Attrs: norecurse nounwind
+define dso_local void @test_vadd_const6(ptr nocapture readonly %a, ptr nocapture %r) local_unnamed_addr #0 {
+entry:
+  %wide.load = load <32 x i32>, ptr %a, align 4
+  %0 = add nsw <32 x i32> %wide.load, 
+  store <32 x i32> %0, ptr %r, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/Hexagon/widening-float-vec.ll b/llvm/test/CodeGen/Hexagon/widening-float-vec.ll
new file mode 100644
index 0000000000000..c696457451e1c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/widening-float-vec.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=hexagon -mattr=+hvx-length128b,+hvxv68 < %s
+
+define void @_Z10range_flatIDF16bEvjT_S0_PS0_(i32 noundef %d, half noundef %start, half noundef %increm, ptr noundef %out) local_unnamed_addr {
+entry:
+  %d.ripple.bcast.splatinsert = insertelement <64 x i32> poison, i32 %d, i64 0
+  %d.ripple.bcast.splat = shufflevector <64 x i32> %d.ripple.bcast.splatinsert, <64 x i32> poison, <64 x i32> zeroinitializer
+  %0 = fpext half %increm to float
+  %.ripple.bcast.splatinsert = insertelement <64 x float> poison, float %0, i64 0
+  %.ripple.bcast.splat = shufflevector <64 x float> %.ripple.bcast.splatinsert, <64 x float> poison, <64 x i32> zeroinitializer
+  %mul.ripple.vectorized = fmul <64 x float> %.ripple.bcast.splat, 
+  %arrayidx = getelementptr i8, ptr %out, i32 0
+  %1 = fptrunc <64 x float> %mul.ripple.vectorized to <64 x half>
+  store <64 x half> %1, ptr %arrayidx, align 2
+  ret void
+}
diff --git a/llvm/test/CodeGen/Hexagon/widening-vec.ll b/llvm/test/CodeGen/Hexagon/widening-vec.ll
new file mode 100644
index 0000000000000..7b7c100bd5666
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/widening-vec.ll
@@ -0,0 +1,96 @@
+; RUN: llc -march=hexagon -mv73 -mhvx -mattr=+hvx-length128b < %s
+; REQUIRES: asserts
+
+; This test checks for an assert. It happens when we attempt to generate widening vector instructions for a vector length that is not a multiple of the HW vector size (1024).
+ +; Function Attrs: nofree norecurse nounwind +define dllexport i32 @foo(ptr noalias nocapture %0, ptr noalias nocapture readonly %1, ptr noalias nocapture readonly %2) local_unnamed_addr { +entry: + %3 = load <121 x i8>, ptr %2, align 1 + %4 = zext <121 x i8> %3 to <121 x i32> + %5 = mul nuw nsw <121 x i32> %4, + %6 = load <121 x i8>, ptr %1, align 1 + %7 = zext <121 x i8> %6 to <121 x i32> + %8 = mul nuw nsw <121 x i32> %7, + %9 = add nsw <121 x i32> %8, + %10 = add nsw <121 x i32> %9, %5 + store <121 x i32> %10, ptr %0, align 4 + ret i32 0 +} + +; The tests below check lowering of add, sub, mul when inputs are extended from 8 to 32 bits. + +; CHECK-LABEL: test_vadd1 +; CHECK: v{{.*}}.h = vadd(v{{[0-9]+}}.ub,v{{[0-9]+}}.ub) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vadd1(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %wide.load19 = load <128 x i8>, ptr %b, align 1 + %1 = zext <128 x i8> %wide.load19 to <128 x i32> + %2 = add nuw nsw <128 x i32> %1, %0 + store <128 x i32> %2, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vsub1 +; CHECK: v{{.*}}.h = vsub(v{{[0-9]+}}.ub,v{{[0-9]+}}.ub) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vsub1(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %wide.load19 = load <128 x i8>, ptr %b, align 1 + %1 = zext <128 x i8> %wide.load19 to <128 x i32> + %2 = sub nuw nsw <128 x i32> %1, %0 + store <128 x i32> %2, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy1 +; CHECK: v{{.*}}.uh = vmpy(v{{[0-9]+}}.ub,v{{[0-9]+}}.ub) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy1(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = zext <128 x i8> %wide.load to <128 x i32> + %wide.load19 = load <128 x i8>, ptr %b, align 1 + %1 = zext <128 x i8> %wide.load19 to <128 x i32> + %2 = mul nuw nsw <128 x i32> %1, %0 + store <128 x i32> %2, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy4 +; CHECK: v{{[0-9:]+}}.h = vmpy(v{{[0-9]+}}.b,v{{[0-9]+}}.b) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy4(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = sext <128 x i8> %wide.load to <128 x i32> + %wide.load19 = load <128 x i8>, ptr %b, align 1 + %1 = sext <128 x i8> %wide.load19 to <128 x i32> + %2 = mul nuw nsw <128 x i32> %1, %0 + store <128 x i32> %2, ptr %r, align 4 + ret void +} + +; CHECK-LABEL: test_vmpy7 +; CHECK: v{{[0-9:]+}}.h = vmpy(v{{[0-9]+}}.ub,v{{[0-9]+}}.b) + +; Function Attrs: norecurse nounwind +define dso_local void @test_vmpy7(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %r) local_unnamed_addr { +entry: + %wide.load = load <128 x i8>, ptr %a, align 1 + %0 = sext <128 x i8> %wide.load to <128 x i32> + %wide.load19 = load <128 x i8>, ptr %b, align 1 + %1 = zext <128 x i8> %wide.load19 to <128 x i32> + %2 = mul nuw nsw <128 x i32> %1, %0 + store <128 x i32> %2, ptr %r, align 4 + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/widening-vec2.ll b/llvm/test/CodeGen/Hexagon/widening-vec2.ll new file mode 100644 index 
0000000000000..3fb288cca388c --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/widening-vec2.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=hexagon -mattr=+hvxv73,+hvx-length128b < %s | FileCheck %s + +; Test to make sure that the widening vector instructions are being generated. + +; CHECK: .uh = vmpy(v{{[0-9:]+}}.ub,v{{[0-9]+}}.ub) + +define dllexport void @test1() local_unnamed_addr { + %1 = load i64, i64* poison, align 8 + %2 = trunc i64 %1 to i16 + %3 = lshr i16 %2, 7 + %4 = and i16 %3, 255 + %broadcast.splatinsert.1 = insertelement <128 x i16> poison, i16 %4, i32 0 + %broadcast.splat.1 = shufflevector <128 x i16> %broadcast.splatinsert.1, <128 x i16> poison, <128 x i32> zeroinitializer + %scevgep = getelementptr i8, i8* null, i32 128 + %lsr.iv13 = bitcast i8* %scevgep to <128 x i8>* + %wide.load.1 = load <128 x i8>, <128 x i8>* poison, align 1 + %5 = zext <128 x i8> %wide.load.1 to <128 x i16> + %6 = mul nuw <128 x i16> %broadcast.splat.1, %5 + %7 = add <128 x i16> zeroinitializer, %6 + %trun = trunc <128 x i16> %7 to <128 x i8> + store <128 x i8> %trun, <128 x i8>* %lsr.iv13, align 1 + ret void +}