diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 68367b7f7a41ff..655ba941f7c579 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -2228,6 +2228,23 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { return BinaryOperator::CreateFSubFMF(XZ, YW, &I); } + auto m_FaddRdx = [](Value *&Sum, Value *&Vec) { + return m_OneUse( + m_Intrinsic( + m_Value(Sum), m_Value(Vec))); + }; + Value *A0, *A1, *V0, *V1; + if (match(Op0, m_FaddRdx(A0, V0)) && match(Op1, m_FaddRdx(A1, V1)) && + V0->getType() == V1->getType()) { + // Difference of sums is sum of differences: + // add_rdx(A0, V0) - add_rdx(A1, V1) --> add_rdx(A0, V0 - V1) - A1 + Value *Sub = Builder.CreateFSubFMF(V0, V1, &I); + Value *Rdx = Builder.CreateIntrinsic( + Intrinsic::experimental_vector_reduce_v2_fadd, + {A0->getType(), Sub->getType()}, {A0, Sub}, &I); + return BinaryOperator::CreateFSubFMF(Rdx, A1, &I); + } + if (Instruction *F = factorizeFAddFSub(I, Builder)) return F; diff --git a/llvm/test/Transforms/InstCombine/vector-reductions.ll b/llvm/test/Transforms/InstCombine/vector-reductions.ll index 5eac0e09414ca5..dfac8da97b3597 100644 --- a/llvm/test/Transforms/InstCombine/vector-reductions.ll +++ b/llvm/test/Transforms/InstCombine/vector-reductions.ll @@ -7,9 +7,9 @@ declare void @use_f32(float) define float @diff_of_sums_v4f32(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_v4f32( -; CHECK-NEXT: [[R0:%.*]] = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) -; CHECK-NEXT: [[R1:%.*]] = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) -; CHECK-NEXT: [[R:%.*]] = fsub reassoc nsz float [[R0]], [[R1]] +; CHECK-NEXT: [[TMP1:%.*]] = fsub reassoc nsz <4 x float> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = call reassoc nsz float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[TMP1]]) +; CHECK-NEXT: [[R:%.*]] = fsub reassoc nsz float [[TMP2]], [[A1:%.*]] ; CHECK-NEXT: ret float [[R]] ; %r0 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0) @@ -18,6 +18,8 @@ define float @diff_of_sums_v4f32(float %a0, <4 x float> %v0, float %a1, <4 x flo ret float %r } +; negative test - fsub must allow reassociation + define float @diff_of_sums_v4f32_fmf(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_v4f32_fmf( ; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) @@ -31,6 +33,8 @@ define float @diff_of_sums_v4f32_fmf(float %a0, <4 x float> %v0, float %a1, <4 x ret float %r } +; negative test - extra uses could create extra instructions + define float @diff_of_sums_extra_use1(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_extra_use1( ; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) @@ -46,6 +50,8 @@ define float @diff_of_sums_extra_use1(float %a0, <4 x float> %v0, float %a1, <4 ret float %r } +; negative test - extra uses could create extra instructions + define float @diff_of_sums_extra_use2(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_extra_use2( ; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) @@ -61,6 +67,8 @@ define float @diff_of_sums_extra_use2(float %a0, <4 x float> %v0, float %a1, <4 ret float %r } +; negative test - can't reassociate different vector types + define float @diff_of_sums_type_mismatch(float %a0, <4 x float> %v0, float %a1, <8 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_type_mismatch( ; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll index 90745f42afabc9..b4bfb4a0c3d731 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -240,10 +240,9 @@ define i32 @TestVectorsEqualFP_alt(float* noalias %Vec0, float* noalias %Vec1, f ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[VEC1:%.*]] to <4 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP1]]) -; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; CHECK-NEXT: [[ADD_3:%.*]] = fsub fast float [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[ADD_3]], [[TOLERANCE:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP4]]) +; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[TMP5]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32 ; CHECK-NEXT: ret i32 [[COND]] ;