diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 3ba77c9a8dc90b..b9b9df35cdb040 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -941,6 +941,11 @@ class TargetTransformInfo {
   /// applies when shouldMaximizeVectorBandwidth returns true.
   unsigned getMinimumVF(unsigned ElemWidth) const;
 
+  /// \return The maximum vectorization factor for types of a given element
+  /// bit width and opcode, or 0 if there is no maximum VF.
+  /// Currently only used by the SLP vectorizer.
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
+
   /// \return True if it should be considered for address type promotion.
   /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
   /// profitable without finding other extensions fed by the same input.
@@ -1498,6 +1503,7 @@ class TargetTransformInfo::Concept {
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
   virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
+  virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
   virtual bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
   virtual unsigned getCacheLineSize() const = 0;
@@ -1917,6 +1923,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMinimumVF(unsigned ElemWidth) const override {
     return Impl.getMinimumVF(ElemWidth);
   }
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override {
+    return Impl.getMaximumVF(ElemWidth, Opcode);
+  }
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
     return Impl.shouldConsiderAddressTypePromotion(
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index b4847844cd0ee8..2c206094ac4a53 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -356,6 +356,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }
 
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { return 0; }
+
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index f327d0cad426db..086a212ee65b70 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -635,6 +635,11 @@ unsigned TargetTransformInfo::getMinimumVF(unsigned ElemWidth) const {
   return TTIImpl->getMinimumVF(ElemWidth);
 }
 
+unsigned TargetTransformInfo::getMaximumVF(unsigned ElemWidth,
+                                           unsigned Opcode) const {
+  return TTIImpl->getMaximumVF(ElemWidth, Opcode);
+}
+
 bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
   return TTIImpl->shouldConsiderAddressTypePromotion(
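The default implementation in TargetTransformInfoImplBase returns 0, which the SLP vectorizer interprets as "no maximum VF", so targets that do not override the hook keep their current behavior. As a rough illustration, an override for a hypothetical target could look like the sketch below (MyTTIImpl is an assumed name, not part of this patch):

```cpp
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Illustrative only: cap non-memory operations at 4 lanes regardless of
// element width, and leave loads/stores unlimited by returning 0, which
// the getMaximumVF() contract defines as "no maximum VF".
unsigned MyTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
    return 0;
  return 4;
}
```

Passing the opcode lets a target distinguish memory operations, which usually benefit from wide vectors, from ALU operations, where a wide VF mostly raises register pressure; the AMDGPU implementation below uses exactly that split.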
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index ab83419183ee7f..2cfb0299eea9c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -288,6 +288,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
   return 32;
 }
 
+unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
+    return 32 * 4 / ElemWidth;
+  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1;
+}
+
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 05a054ac30ff2e..1785a2df2eba67 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -170,6 +170,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   unsigned getNumberOfRegisters(unsigned RCID) const;
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
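The GCN numbers encode two subtarget facts: vector memory operations can fill a single 128-bit (4 x 32-bit) access, so loads and stores may use as many lanes as fit in 128 bits, while ALU operations only pay off in the 2-wide packed 16-bit forms available when the subtarget has 16-bit instructions. A standalone sketch of the resulting limits (gcnMaximumVF and Has16BitInsts are illustrative stand-ins for the member function and ST->has16BitInsts() above):

```cpp
#include <cstdio>

// Mirrors the GCNTTIImpl::getMaximumVF() policy above, for illustration.
static unsigned gcnMaximumVF(unsigned ElemWidth, bool IsLoadOrStore,
                             bool Has16BitInsts) {
  if (IsLoadOrStore)
    return 32 * 4 / ElemWidth; // as many lanes as fill one 128-bit access
  return (ElemWidth == 16 && Has16BitInsts) ? 2 : 1; // packed 16-bit ALU only
}

int main() {
  printf("i32 load/store: VF %u\n", gcnMaximumVF(32, true, true));  // 4
  printf("i16 load/store: VF %u\n", gcnMaximumVF(16, true, true));  // 8
  printf("i8  load/store: VF %u\n", gcnMaximumVF(8, true, true));   // 16
  printf("i16 ALU op    : VF %u\n", gcnMaximumVF(16, false, true)); // 2
  printf("i32 ALU op    : VF %u\n", gcnMaximumVF(32, false, true)); // 1
  return 0;
}
```

The VF-1 answer for 32-bit ALU operations is what turns the uadd_sat_v2i32 and round_v2f32 tests below back into scalar intrinsic calls.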
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e1c1c6edf08cd1..cda43521e9ba1e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -126,6 +126,10 @@ static cl::opt<unsigned>
 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
     cl::desc("Attempt to vectorize for this register size in bits"));
 
+static cl::opt<unsigned>
+MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
+    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
+
 static cl::opt<int>
 MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
     cl::desc("Maximum depth of the lookup for consecutive stores."));
@@ -741,6 +745,12 @@ class BoUpSLP {
     return MinVecRegSize;
   }
 
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
+    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
+      MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
+    return MaxVF ? MaxVF : UINT_MAX;
+  }
+
   /// Check if homogeneous aggregate is isomorphic to some VectorType.
   /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
   /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
@@ -6191,6 +6201,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   unsigned Sz = R.getVectorElementSize(I0);
   unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
   unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
   if (MaxVF < 2) {
     R.getORE()->emit([&]() {
       return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
@@ -7633,7 +7644,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   SmallVector<Instruction *, 8> Incoming;
   SmallPtrSet<Value *, 16> VisitedInstrs;
-  unsigned MaxVecRegSize = R.getMaxVecRegSize();
 
   bool HaveVectorizedPhiNodes = true;
   while (HaveVectorizedPhiNodes) {
@@ -7660,27 +7670,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
 
       // Look for the next elements with the same type.
       SmallVector<Instruction *, 8>::iterator SameTypeIt = IncIt;
-      Type *EltTy = (*IncIt)->getType();
-
-      assert(EltTy->isSized() &&
-             "Instructions should all be sized at this point");
-      TypeSize EltTS = DL->getTypeSizeInBits(EltTy);
-      if (EltTS.isScalable()) {
-        // For now, just ignore vectorizing scalable types.
-        ++IncIt;
-        continue;
-      }
-
-      unsigned EltSize = EltTS.getFixedSize();
-      unsigned MaxNumElts = MaxVecRegSize / EltSize;
-      if (MaxNumElts < 2) {
-        ++IncIt;
-        continue;
-      }
-
       while (SameTypeIt != E &&
-             (*SameTypeIt)->getType() == EltTy &&
-             static_cast<unsigned>(SameTypeIt - IncIt) < MaxNumElts) {
+             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
         VisitedInstrs.insert(*SameTypeIt);
         ++SameTypeIt;
       }
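In tryToVectorizeList() the new bound is applied on top of the existing ones, and an explicit -slp-max-vf=N on the command line takes precedence over the target hook, with 0 still meaning unlimited. A simplified model of the combined clamp (a sketch, not the pass itself; the real code also distinguishes an unset flag from an explicit -slp-max-vf=0):

```cpp
#include <algorithm>
#include <cassert>
#include <climits>

// Models the VF bounds in SLPVectorizerPass::tryToVectorizeList().
// NumScalars stands for PowerOf2Floor(VL.size()); CLMaxVF stands for an
// explicit -slp-max-vf=N, with 0 here meaning "flag not given".
static unsigned clampedMaxVF(unsigned NumScalars, unsigned MinVF,
                             unsigned TTIMaxVF, unsigned CLMaxVF) {
  unsigned Limit = CLMaxVF ? CLMaxVF : TTIMaxVF; // command line wins over TTI
  if (!Limit)
    Limit = UINT_MAX;                            // 0 == no maximum VF
  unsigned MaxVF = std::max(NumScalars, MinVF);
  return std::min(Limit, MaxVF); // below 2, SLP emits the "SmallVF"
                                 // remark and gives up on the bundle
}

int main() {
  // Two i32 saturating adds on GCN: the TTI limit is 1, so MaxVF < 2
  // and the bundle stays scalar.
  assert(clampedMaxVF(/*NumScalars=*/2, /*MinVF=*/2, /*TTIMaxVF=*/1,
                      /*CLMaxVF=*/0) == 1);
  // Four i16 saturating adds on GFX8: the TTI limit is 2, so the
  // four-wide bundle is vectorized in two-lane pieces.
  assert(clampedMaxVF(4, 2, 2, 0) == 2);
  return 0;
}
```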
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index 514cf074908601..c7985b90f012fd 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -123,12 +123,18 @@ bb:
   ret <2 x i16> %ins.1
 }
 
-; FIXME: Should not vectorize
 define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @uadd_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -145,8 +151,15 @@
 define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @usub_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -163,8 +176,15 @@
 define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @sadd_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -181,8 +201,15 @@
 define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @ssub_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -267,8 +294,14 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])
-; GFX8-NEXT:    ret <4 x i16> [[TMP0]]
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX8-NEXT:    [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
 ;
 bb:
   %arg0.0 = extractelement <4 x i16> %arg0, i64 0
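With the GFX8 ALU limit of two 16-bit lanes, the four-wide bundle in uadd_sat_v4i16 is no longer emitted as a single <4 x i16> intrinsic: it is vectorized as two independent <2 x i16> groups that the final shufflevector stitches back together. Schematically (an illustration of the resulting chunking, not of the SLP algorithm itself):

```cpp
#include <cstdio>

// Illustrative only: a four-element bundle processed at a clamped VF of 2
// yields the two <2 x i16> @llvm.uadd.sat.v2i16 calls checked above.
static void chunkBundle(unsigned NumScalars, unsigned MaxVF) {
  for (unsigned Begin = 0; Begin + MaxVF <= NumScalars; Begin += MaxVF)
    printf("lanes [%u,%u) -> one <%u x i16> call\n", Begin, Begin + MaxVF,
           MaxVF);
}

int main() {
  chunkBundle(/*NumScalars=*/4, /*MaxVF=*/2); // lanes [0,2) and [2,4)
  return 0;
}
```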
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
index 84d25fe79a27c6..7a8e08de4138b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
@@ -18,9 +18,9 @@ bb:
   ret <2 x half> %tmp5
 }
 
-; TODO: Should probably not really be vectorizing this
 ; GCN-LABEL: @round_v2f32(
-; GCN: call <2 x float> @llvm.round.v2f32
+; GCN: call float @llvm.round.f32(
+; GCN: call float @llvm.round.f32(
 define <2 x float> @round_v2f32(<2 x float> %arg) {
 bb:
   %tmp = extractelement <2 x float> %arg, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
index 466e83d0260afb..ce33efc951fcc4 100644
--- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
@@ -1,7 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -S -slp-max-reg-size=32 < %s | FileCheck -check-prefix=MAX32 %s
-; RUN: opt -slp-vectorizer -S -slp-max-reg-size=256 < %s | FileCheck -check-prefix=MAX256 %s
-; RUN: opt -slp-vectorizer -S -slp-max-reg-size=1024 < %s | FileCheck -check-prefix=MAX1024 %s
+; RUN: opt -slp-vectorizer -S -slp-max-vf=1 < %s | FileCheck -check-prefix=MAX32 %s
+; RUN: opt -slp-vectorizer -S -slp-max-vf=8 < %s | FileCheck -check-prefix=MAX256 %s
+; RUN: opt -slp-vectorizer -S -slp-max-vf=32 < %s | FileCheck -check-prefix=MAX1024 %s
+; RUN: opt -slp-vectorizer -S < %s | FileCheck -check-prefix=MAX1024 %s
+
+; Make sure we do not vectorize to create a PHI wider than requested.
+; On the AMDGPU target, wider vectorization results in higher register
+; pressure, spilling, or even failure to allocate registers.
 
 define void @phi_float32(half %hval, float %fval) {
 ; MAX32-LABEL: @phi_float32(
@@ -120,6 +125,7 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX32-NEXT:    [[PHI30:%.*]] = phi float [ [[I63]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]
 ; MAX32-NEXT:    [[PHI31:%.*]] = phi float [ [[I65]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[I65]], [[BB5]] ], [ [[I65]], [[BB1]] ]
 ; MAX32-NEXT:    [[PHI32:%.*]] = phi float [ [[I67]], [[BB3]] ], [ [[I67]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[I67]], [[BB1]] ]
+; MAX32-NEXT:    store float [[PHI31]], float* undef, align 4
 ; MAX32-NEXT:    ret void
 ;
 ; MAX256-LABEL: @phi_float32(
@@ -296,6 +302,8 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX256-NEXT:    [[TMP154:%.*]] = phi <8 x float> [ [[TMP28]], [[BB3]] ], [ [[TMP93]], [[BB4]] ], [ [[TMP132]], [[BB5]] ], [ [[TMP54]], [[BB1]] ]
 ; MAX256-NEXT:    [[TMP155:%.*]] = phi <8 x float> [ [[TMP30]], [[BB3]] ], [ [[TMP103]], [[BB4]] ], [ [[TMP142]], [[BB5]] ], [ [[TMP64]], [[BB1]] ]
 ; MAX256-NEXT:    [[TMP156:%.*]] = phi <8 x float> [ [[TMP32]], [[BB3]] ], [ [[TMP113]], [[BB4]] ], [ [[TMP152]], [[BB5]] ], [ [[TMP74]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP157:%.*]] = extractelement <8 x float> [[TMP156]], i32 6
+; MAX256-NEXT:    store float [[TMP157]], float* undef, align 4
 ; MAX256-NEXT:    ret void
 ;
 ; MAX1024-LABEL: @phi_float32(
@@ -481,6 +489,8 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX1024-NEXT:    br label [[BB2]]
 ; MAX1024:       bb2:
 ; MAX1024-NEXT:    [[TMP165:%.*]] = phi <32 x float> [ [[TMP38]], [[BB3]] ], [ [[TMP125]], [[BB4]] ], [ [[TMP164]], [[BB5]] ], [ [[TMP86]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP166:%.*]] = extractelement <32 x float> [[TMP165]], i32 30
+; MAX1024-NEXT:    store float [[TMP166]], float* undef, align 4
 ; MAX1024-NEXT:    ret void
 ;
 bb:
@@ -603,5 +613,6 @@ bb2:
   %phi30 = phi float [ %i63, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ]
   %phi31 = phi float [ %i65, %bb3 ], [ %fval, %bb4 ], [ %i65, %bb5 ], [ %i65, %bb1 ]
   %phi32 = phi float [ %i67, %bb3 ], [ %i67, %bb4 ], [ %fval, %bb5 ], [ %i67, %bb1 ]
+  store float %phi31, float* undef
   ret void
 }