diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 842a19f4767187..41086e4abe6110 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -69,9 +69,11 @@ class RecurrenceDescriptor {
   RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K,
                        FastMathFlags FMF, Instruction *ExactFP, Type *RT,
-                       bool Signed, SmallPtrSetImpl<Instruction *> &CI)
+                       bool Signed, bool Ordered,
+                       SmallPtrSetImpl<Instruction *> &CI)
       : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF),
-        ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed) {
+        ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed),
+        IsOrdered(Ordered) {
     CastInsts.insert(CI.begin(), CI.end());
   }

@@ -228,6 +230,9 @@ class RecurrenceDescriptor {
   /// Returns true if all source operands of the recurrence are SExtInsts.
   bool isSigned() const { return IsSigned; }

+  /// Expose an ordered FP reduction to the instance users.
+  bool isOrdered() const { return IsOrdered; }
+
   /// Attempts to find a chain of operations from Phi to LoopExitInst that can
   /// be treated as a set of reductions instructions for in-loop reductions.
   SmallVector<Instruction *, 4> getReductionOpChain(PHINode *Phi,
@@ -250,6 +255,10 @@ class RecurrenceDescriptor {
   Type *RecurrenceType = nullptr;
   // True if all source operands of the recurrence are SExtInsts.
   bool IsSigned = false;
+  // True if this recurrence can be treated as an in-order reduction.
+  // Currently only a non-reassociative FAdd can be considered in-order,
+  // if it is also the only FAdd in the PHI's use chain.
+  bool IsOrdered = false;
   // Instructions used for type-promoting the recurrence.
   SmallPtrSet<Instruction *, 8> CastInsts;
 };
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index bf12941e5f4e19..067da68bd8de42 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -389,6 +389,11 @@ Value *createSimpleTargetReduction(IRBuilderBase &B,
 Value *createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI,
                              RecurrenceDescriptor &Desc, Value *Src);

+/// Create an ordered reduction intrinsic using the given recurrence
+/// descriptor \p Desc.
+Value *createOrderedReduction(IRBuilderBase &B, RecurrenceDescriptor &Desc,
+                              Value *Src, Value *Start);
+
 /// Get the intersection (logical and) of all of the potential IR flags
 /// of each scalar operation (VL) that will be converted into a vector (I).
 /// If OpValue is non-null, we only consider operations similar to OpValue
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 8eaa6ef14596fc..3427150288b86b 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -189,6 +189,32 @@ static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
   }
 }

+// Check if a given Phi node can be recognized as an ordered reduction for
+// vectorizing floating point operations without unsafe math.
+static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst,
+                                  Instruction *Exit, PHINode *Phi) {
+  // Currently only FAdd is supported
+  if (Kind != RecurKind::FAdd)
+    return false;
+
+  bool IsOrdered =
+      Exit->getOpcode() == Instruction::FAdd && Exit == ExactFPMathInst;
+
+  // The only pattern accepted is the one in which the reduction PHI
+  // is used as one of the operands of the exit instruction
+  auto *LHS = Exit->getOperand(0);
+  auto *RHS = Exit->getOperand(1);
+  IsOrdered &= ((LHS == Phi) || (RHS == Phi));
+
+  if (!IsOrdered)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "LV: Found an ordered reduction: Phi: " << *Phi
+                    << ", ExitInst: " << *Exit << "\n");
+
+  return true;
+}
+
 bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
                                            Loop *TheLoop, FastMathFlags FuncFMF,
                                            RecurrenceDescriptor &RedDes,
@@ -416,6 +442,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
     return false;

+  const bool IsOrdered = checkOrderedReduction(
+      Kind, ReduxDesc.getExactFPMathInst(), ExitInstruction, Phi);
+
   if (Start != Phi) {
     // If the starting value is not the same as the phi node, we speculatively
     // looked through an 'and' instruction when evaluating a potential
@@ -470,7 +499,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   // Save the description of this reduction variable.
   RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
                           ReduxDesc.getExactFPMathInst(), RecurrenceType,
-                          IsSigned, CastInsts);
+                          IsSigned, IsOrdered, CastInsts);
   RedDes = RD;

   return true;
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index d2482fe6013698..7f1bbeb34d1437 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1070,6 +1070,17 @@ Value *llvm::createTargetReduction(IRBuilderBase &B,
   return createSimpleTargetReduction(B, TTI, Src, Desc.getRecurrenceKind());
 }

+Value *llvm::createOrderedReduction(IRBuilderBase &B,
+                                    RecurrenceDescriptor &Desc, Value *Src,
+                                    Value *Start) {
+  auto Kind = Desc.getRecurrenceKind();
+  assert(Kind == RecurKind::FAdd && "Unexpected reduction kind");
+  assert(Src->getType()->isVectorTy() && "Expected a vector type");
+  assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
+
+  return B.CreateFAddReduce(Start, Src);
+}
+
 void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue) {
   auto *VecOp = dyn_cast<Instruction>(I);
   if (!VecOp)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 57f1b4e145c389..31efee728ddcd0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -330,6 +330,11 @@ static cl::opt<bool>
     cl::desc("Prefer in-loop vector reductions, "
              "overriding the targets preference."));

+cl::opt<bool> EnableStrictReductions(
+    "enable-strict-reductions", cl::init(false), cl::Hidden,
+    cl::desc("Enable the vectorisation of loops with in-order (strict) "
+             "FP reductions"));
+
 static cl::opt<bool> PreferPredicatedReductionSelect(
     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
     cl::desc(
@@ -4259,6 +4264,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
 }

+static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
+  return EnableStrictReductions && RdxDesc.isOrdered();
+}
+
 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
   // Get it's reduction variable descriptor.
   assert(Legal->isReductionVariable(Phi) &&
@@ -4288,6 +4297,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
   for (unsigned Part = 0; Part < UF; ++Part) {
     Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part);
     Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part);
+    if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc) &&
+        State.VF.isVector())
+      Val = State.get(State.Plan->getVPValue(LoopVal), UF - 1);
     cast<PHINode>(VecRdxPhi)
         ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
   }
@@ -4379,7 +4391,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
   // terminate on this line. This is the easiest way to ensure we don't
   // accidentally cause an extra step back into the loop while debugging.
   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
-  {
+  if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc))
+    ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
+  else {
     // Floating-point operations should have some FMF to enable the reduction.
     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
@@ -6078,7 +6092,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
         if (!Legal->isReductionVariable(PN))
           continue;
         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
-        if (PreferInLoopReductions ||
+        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
            TTI.preferInLoopReduction(RdxDesc.getOpcode(),
                                      RdxDesc.getRecurrenceType(),
                                      TargetTransformInfo::ReductionFlags()))
@@ -7657,7 +7671,7 @@ void LoopVectorizationCostModel::collectInLoopReductions() {
     // If the target would prefer this reduction to happen "in-loop", then we
     // want to record it as such.
     unsigned Opcode = RdxDesc.getOpcode();
-    if (!PreferInLoopReductions &&
+    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
                                    TargetTransformInfo::ReductionFlags()))
       continue;
@@ -9200,8 +9214,10 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {

 void VPReductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Reduction being replicated.");
+  Value *PrevInChain = State.get(getChainOp(), 0);
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     RecurKind Kind = RdxDesc->getRecurrenceKind();
+    bool IsOrdered = useOrderedReductions(*RdxDesc);
     Value *NewVecOp = State.get(getVecOp(), Part);
     if (VPValue *Cond = getCondOp()) {
       Value *NewCond = State.get(Cond, Part);
@@ -9213,15 +9229,23 @@ void VPReductionRecipe::execute(VPTransformState &State) {
       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
       NewVecOp = Select;
     }
-    Value *NewRed =
-        createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
-    Value *PrevInChain = State.get(getChainOp(), Part);
+    Value *NewRed;
     Value *NextInChain;
+    if (IsOrdered) {
+      NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
+                                      PrevInChain);
+      PrevInChain = NewRed;
+    } else {
+      PrevInChain = State.get(getChainOp(), Part);
+      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
+    }
     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
       NextInChain = createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                                    NewRed, PrevInChain);
-    } else {
+    } else if (IsOrdered)
+      NextInChain = NewRed;
+    else {
       NextInChain = State.Builder.CreateBinOp(
           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
           PrevInChain);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
new file mode 100644
index 00000000000000..ed16349c14bb30
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -0,0 +1,280 @@
+; RUN: opt < %s -loop-vectorize -instcombine -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions -S | FileCheck %s -check-prefix=CHECK
+
+define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
+; CHECK-LABEL: @fadd_strict
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK: %[[LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD]])
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[PHI]]
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret float %add
+}
+
+define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
+; CHECK-LABEL: @fadd_strict_unroll
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
+; CHECK: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
+; CHECK: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
+; CHECK: %[[RDX4:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK: ret float %[[PHI]]
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret float %add
+}
+
+define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_strict_interleave
+; CHECK: entry
+; CHECK: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK: %[[LOAD1:.*]] = load float, float* %a
+; CHECK: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
+; CHECK: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
+; CHECK: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[STRIDED1]])
+; CHECK: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[STRIDED2]])
+; CHECK: for.end
+; CHECK: ret void
+entry:
+  %arrayidxa = getelementptr inbounds float, float* %a, i64 1
+  %a1 = load float, float* %a, align 4
+  %a2 = load float, float* %arrayidxa, align 4
+  br label %for.body
+
+for.body:
+  %add.phi1 = phi float [ %a2, %entry ], [ %add2, %for.body ]
+  %add.phi2 = phi float [ %a1, %entry ], [ %add1, %for.body ]
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidxb1 = getelementptr inbounds float, float* %b, i64 %iv
+  %0 = load float, float* %arrayidxb1, align 4
+  %add1 = fadd float %0, %add.phi2
+  %or = or i64 %iv, 1
+  %arrayidxb2 = getelementptr inbounds float, float* %b, i64 %or
+  %1 = load float, float* %arrayidxb2, align 4
+  %add2 = fadd float %1, %add.phi1
+  %iv.next = add nuw nsw i64 %iv, 2
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:
+  store float %add1, float* %a, align 4
+  store float %add2, float* %arrayidxa, align 4
+  ret void
+}
+
+define float @fadd_invariant(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_invariant
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[ADD:.*]] = fadd <4 x float> %[[LOAD1]], %[[LOAD2]]
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[ADD]])
+; CHECK: for.end.loopexit
+; CHECK: %[[EXIT_PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
+; CHECK: ret float %[[PHI]]
+entry:
+  %arrayidx = getelementptr inbounds float, float* %a, i64 1
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 5.000000e-01
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %res.014 = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %b, i64 %iv
+  %2 = load float, float* %arrayidx4, align 4
+  %add = fadd float %1, %2
+  %rdx = fadd float %res.014, %add
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end: ; preds = %for.body, %entry
+  %res = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
+  ret float %res
+}
+
+define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_conditional
+; CHECK: vector.body:
+; CHECK: %[[PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue6 ]
+; CHECK: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer
+; CHECK: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0
+; CHECK: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
+; CHECK: pred.load.continue6
+; CHECK: %[[PHI1:.*]] = phi <4 x float> [ %[[PHI0:.*]], %pred.load.continue4 ], [ %[[INS_ELT:.*]], %pred.load.if5 ]
+; CHECK: %[[PRED:.*]] = select <4 x i1> %[[FCMP1]], <4 x float> %[[PHI1]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[PHI]], <4 x float> %[[PRED]])
+; CHECK: for.body
+; CHECK: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK: %[[LOAD2:.*]] = load float, float*
+; CHECK: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00
+; CHECK: br i1 %[[FCMP2]], label %if.then, label %for.inc
+; CHECK: if.then
+; CHECK: %[[LOAD3:.*]] = load float, float*
+; CHECK: br label %for.inc
+; CHECK: for.inc
+; CHECK: %[[PHI2:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ]
+; CHECK: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI2]]
+; CHECK: for.end
+; CHECK: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RDX_PHI]]
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %res = phi float [ 1.000000e+00, %entry ], [ %fadd, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %tobool = fcmp une float %0, 0.000000e+00
+  br i1 %tobool, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:
+  %phi = phi float [ %1, %if.then ], [ 3.000000e+00, %for.body ]
+  %fadd = fadd float %res, %phi
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:
+  %rdx = phi float [ %fadd, %for.inc ]
+  ret float %rdx
+}
+
+; Test to check masking correct, using the "llvm.loop.vectorize.predicate.enable" attribute
+define float @fadd_predicated(float* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @fadd_predicated
+; CHECK: vector.ph
+; CHECK: %[[TRIP_MINUS_ONE:.*]] = add i64 %n, -1
+; CHECK: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i32 0
+; CHECK: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK: vector.body
+; CHECK: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue2 ]
+; CHECK: pred.load.continue2
+; CHECK: %[[PHI:.*]] = phi <2 x float> [ %[[PHI0:.*]], %pred.load.continue ], [ %[[INS_ELT:.*]], %pred.load.if1 ]
+; CHECK: %[[MASK:.*]] = select <2 x i1> %0, <2 x float> %[[PHI]], <2 x float> <float -0.000000e+00, float -0.000000e+00>
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v2f32(float %[[RDX_PHI]], <2 x float> %[[MASK]])
+; CHECK: for.end:
+; CHECK: %[[RES_PHI:.*]] = phi float [ undef, %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RES_PHI]]
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %sum.02 = phi float [ %l7, %for.body ], [ 0.000000e+00, %entry ]
+  %l2 = getelementptr inbounds float, float* %a, i64 %iv
+  %l3 = load float, float* %l2, align 4
+  %l7 = fadd float %sum.02, %l3
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
+
+for.end: ; preds = %for.body
+  %sum.0.lcssa = phi float [ %l7, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; Negative test - loop contains multiple fadds which we cannot safely reorder
+define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
+; CHECK-LABEL: @fadd_multiple
+; CHECK: vector.body
+; CHECK: %[[PHI:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>
+; CHECK: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
+; CHECK: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>
+; CHECK: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
+; CHECK: middle.block
+; CHECK: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
+; CHECK: for.body
+; CHECK: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK: %[[LOAD1:.*]] = load float, float*
+; CHECK: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
+; CHECK: %[[LOAD2:.*]] = load float, float*
+; CHECK: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
+; CHECK: for.end
+; CHECK: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RET]]
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum = phi float [ -0.000000e+00, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %sum, %0
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add3 = fadd float %add, %1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end: ; preds = %for.body
+  %rdx = phi float [ %add3, %for.body ]
+  ret float %rdx
+}
+
+!0 = distinct !{!0, !4, !7, !9}
+!1 = distinct !{!1, !4, !8, !9}
+!2 = distinct !{!2, !5, !7, !9}
+!3 = distinct !{!3, !6, !7, !9, !10}
+!4 = !{!"llvm.loop.vectorize.width", i32 8}
+!5 = !{!"llvm.loop.vectorize.width", i32 4}
+!6 = !{!"llvm.loop.vectorize.width", i32 2}
+!7 = !{!"llvm.loop.interleave.count", i32 1}
+!8 = !{!"llvm.loop.interleave.count", i32 4}
+!9 = !{!"llvm.loop.vectorize.enable", i1 true}
+!10 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
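
For reference, the IR shape the new path produces can be illustrated with a hand-written sketch. This is illustrative only, not vectorizer output: the function name, block names and the constant trip count are assumptions chosen to mirror the fadd_strict test above. The point is that the accumulator stays scalar and is fed back as the start operand of llvm.vector.reduce.fadd, which performs a strictly in-order reduction when no reassoc flag is present; this is the form createOrderedReduction emits via CreateFAddReduce(Start, Src).

; Hypothetical sketch, not vectorizer output.
define float @strict_fadd_sketch(float* %a) {
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Scalar accumulator threaded through the loop, preserving FP ordering.
  %vec.phi = phi float [ 0.000000e+00, %entry ], [ %rdx, %vector.body ]
  %gep = getelementptr inbounds float, float* %a, i64 %index
  %vec.ptr = bitcast float* %gep to <8 x float>*
  %wide.load = load <8 x float>, <8 x float>* %vec.ptr, align 4
  ; In-order reduction: start value is the running scalar sum, no reassoc flag.
  %rdx = call float @llvm.vector.reduce.fadd.v8f32(float %vec.phi, <8 x float> %wide.load)
  %index.next = add nuw i64 %index, 8
  %done = icmp eq i64 %index.next, 1024
  br i1 %done, label %exit, label %vector.body

exit:
  ret float %rdx
}

declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)

With interleaving (fadd_strict_unroll), the same pattern repeats per unrolled part, but the part reductions are chained through one another (RDX1 -> RDX2 -> RDX3 -> RDX4) rather than combined with vector fadds, so the sequential order is preserved across all parts.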