diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 007888e3a7f4b..b18aa6c4865e2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -395,13 +395,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { /// we always assume predicated blocks have a 50% chance of executing. static unsigned getReciprocalPredBlockProb() { return 2; } -/// A helper function that adds a 'fast' flag to floating-point operations. -static Value *addFastMathFlag(Value *V) { - if (isa(V)) - cast(V)->setFastMathFlags(FastMathFlags::getFast()); - return V; -} - /// A helper function that returns an integer or floating-point constant with /// value C. static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { @@ -2247,7 +2240,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // floating-point arithmetic as appropriate. Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); - Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); + Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF); // Create a vector splat to use in the induction update. // @@ -2274,8 +2267,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, State, Part); - LastInduction = cast(addFastMathFlag( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); LastInduction->setDebugLoc(EntryVal->getDebugLoc()); } @@ -2407,6 +2400,11 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, } }; + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (ID.getInductionBinOp() && isa(ID.getInductionBinOp())) + Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); + // Now do the actual transformations, and start with creating the step value. Value *Step = CreateStepValue(ID.getStep()); if (VF.isZero() || VF.isScalar()) { @@ -2486,23 +2484,11 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); // Add the consecutive indices to the vector value. + // Floating-point operations inherit FMF via the builder's flags. Constant *Cv = ConstantVector::get(Indices); - Step = Builder.CreateVectorSplat(VLen, Step); - - // Floating point operations had to be 'fast' to enable the induction. - FastMathFlags Flags; - Flags.setFast(); - Value *MulOp = Builder.CreateFMul(Cv, Step); - if (isa(MulOp)) - // Have to check, MulOp may be a constant - cast(MulOp)->setFastMathFlags(Flags); - - Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); - if (isa(BOp)) - cast(BOp)->setFastMathFlags(Flags); - return BOp; + return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); } void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, @@ -2547,15 +2533,15 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); if (ScalarIVTy->isFloatingPointTy()) StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); - StartIdx = addFastMathFlag(Builder.CreateBinOp( - AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); + StartIdx = Builder.CreateBinOp( + AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)); // The step returned by `createStepForVF` is a runtime-evaluated value // when VF is scalable. Otherwise, it should be folded into a Constant. assert((VF.isScalable() || isa(StartIdx)) && "Expected StartIdx to be folded to a constant when VF is not " "scalable"); - auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); - auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); + auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); + auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); State.set(Def, Add, VPIteration(Part, Lane)); recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, Part, Lane); @@ -3325,6 +3311,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex( return LoopVectorBody->getTerminator(); return &*B.GetInsertPoint(); }; + switch (ID.getKind()) { case InductionDescriptor::IK_IntInduction: { assert(Index->getType() == StartValue->getType() && @@ -3352,22 +3339,9 @@ Value *InnerLoopVectorizer::emitTransformedIndex( "Original bin op should be defined for FP induction"); Value *StepValue = cast(Step)->getValue(); - - // Floating point operations had to be 'fast' to enable the induction. - FastMathFlags Flags; - Flags.setFast(); - Value *MulExp = B.CreateFMul(StepValue, Index); - if (isa(MulExp)) - // We have to check, the MulExp may be a constant. - cast(MulExp)->setFastMathFlags(Flags); - - Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, - "induction"); - if (isa(BOp)) - cast(BOp)->setFastMathFlags(Flags); - - return BOp; + return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, + "induction"); } case InductionDescriptor::IK_NoInduction: return nullptr; @@ -3454,6 +3428,11 @@ void InnerLoopVectorizer::createInductionResumeValues( EndValue = VectorTripCount; } else { IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + + // Fast-math-flags propagate from the original induction instruction. + if (II.getInductionBinOp() && isa(II.getInductionBinOp())) + B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); + Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = CastInst::getCastOpcode(VectorTripCount, true, StepType, true); @@ -3675,6 +3654,11 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, assert(isa(UI) && "Expected LCSSA form"); IRBuilder<> B(MiddleBlock->getTerminator()); + + // Fast-math-flags propagate from the original induction instruction. + if (II.getInductionBinOp() && isa(II.getInductionBinOp())) + B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); + Value *CountMinusOne = B.CreateSub( CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); Value *CMO = @@ -7889,9 +7873,9 @@ Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, if (Ty->isFloatingPointTy()) { Constant *C = ConstantFP::get(Ty, (double)StartIdx); - // Floating point operations had to be 'fast' to enable the unrolling. - Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); - return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); + // Floating-point operations inherit FMF via the builder's flags. + Value *MulOp = Builder.CreateFMul(C, Step); + return Builder.CreateBinOp(BinOp, Val, MulOp); } Constant *C = ConstantInt::get(Ty, StartIdx); return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index c3215bcbb1a12..291c01efc3abe 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -105,8 +105,8 @@ for.end: ; preds = %for.end.loopexit, % ret void } -; FIXME: We do not need the full 'fast' FMF to vectorize the loop, but the code can't become -; 'fast' spontaneously. Something is wrong with FMF expectations/propagation. +; We do not need the full 'fast' FMF to vectorize the loop, but the code can't become +; 'fast' spontaneously - FMF should propagate from the original IR. define void @fp_iv_loop1_reassoc_FMF(float %init, float* noalias nocapture %A, i32 %N) { ; VEC4_INTERL1-LABEL: @fp_iv_loop1_reassoc_FMF( @@ -123,15 +123,15 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, float* noalias nocapture %A, i ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588 ; VEC4_INTERL1-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], [[CAST_CRD]] -; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP3]] +; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], [[CAST_CRD]] +; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP3]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], -; VEC4_INTERL1-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP4]] -; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = fmul fast float [[FPINC]], 4.000000e+00 +; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = fmul reassoc <4 x float> [[DOTSPLAT3]], +; VEC4_INTERL1-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT]], [[TMP4]] +; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = fmul reassoc float [[FPINC]], 4.000000e+00 ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -142,7 +142,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, float* noalias nocapture %A, i ; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>* ; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP7]], align 4 ; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT5]] +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fsub reassoc <4 x float> [[VEC_IND]], [[DOTSPLAT5]] ; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VEC4_INTERL1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; VEC4_INTERL1: middle.block: @@ -181,22 +181,22 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, float* noalias nocapture %A, i ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934584 ; VEC4_INTERL2-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], [[CAST_CRD]] -; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP3]] +; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], [[CAST_CRD]] +; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP3]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i32 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i32 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], -; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP4]] -; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[FPINC]], 4.000000e+00 +; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fmul reassoc <4 x float> [[DOTSPLAT3]], +; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT]], [[TMP4]] +; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul reassoc float [[FPINC]], 4.000000e+00 ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i32 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT5]] +; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub reassoc <4 x float> [[VEC_IND]], [[DOTSPLAT5]] ; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] ; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>* ; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP7]], align 4 @@ -204,7 +204,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, float* noalias nocapture %A, i ; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>* ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP9]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 -; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[STEP_ADD]], [[DOTSPLAT5]] +; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub reassoc <4 x float> [[STEP_ADD]], [[DOTSPLAT5]] ; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VEC4_INTERL2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; VEC4_INTERL2: middle.block: @@ -265,15 +265,15 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, float* noalias nocapture %A, i ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934590 ; VEC2_INTERL1_PRED_STORE-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], [[CAST_CRD]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP3]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], [[CAST_CRD]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP3]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i32 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i32 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT2]], <2 x float> poison, <2 x i32> zeroinitializer -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[DOTSPLAT3]], -; VEC2_INTERL1_PRED_STORE-NEXT: [[INDUCTION:%.*]] = fsub fast <2 x float> [[DOTSPLAT]], [[TMP4]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = fmul fast float [[FPINC]], 2.000000e+00 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = fmul reassoc <2 x float> [[DOTSPLAT3]], +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDUCTION:%.*]] = fsub reassoc <2 x float> [[DOTSPLAT]], [[TMP4]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = fmul reassoc float [[FPINC]], 2.000000e+00 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT4]], <2 x float> poison, <2 x i32> zeroinitializer ; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]] @@ -284,7 +284,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, float* noalias nocapture %A, i ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>* ; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[VEC_IND]], <2 x float>* [[TMP7]], align 4 ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND_NEXT]] = fsub fast <2 x float> [[VEC_IND]], [[DOTSPLAT5]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND_NEXT]] = fsub reassoc <2 x float> [[VEC_IND]], [[DOTSPLAT5]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; VEC2_INTERL1_PRED_STORE: middle.block: