[InstCombine] fold reassociative FP add into start value of fadd reduction

This pattern is visible in unrolled and vectorized loops.
Although the backend seems to be able to reassociate to
ideal form in the examples I looked at, we might as well
do that in IR for efficiency.
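
As a concrete illustration, here is a minimal before/after sketch of the basic fold, distilled from the fadd.ll tests below (the function names @before and @after are illustrative only); running instcombine rewrites @before into the shape of @after:

declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)

; Before: a zero-start reduction whose only use is a reassociable fadd.
define float @before(float %x, <4 x float> %v) {
  %rdx = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> %v)
  %add = fadd reassoc nsz float %rdx, %x
  ret float %add
}

; After: %x becomes the start value and the scalar fadd goes away.
define float @after(float %x, <4 x float> %v) {
  %rdx = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float %x, <4 x float> %v)
  ret float %rdx
}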
rotateright committed Jul 18, 2021
1 parent 0590502 commit 0e15de2
Showing 3 changed files with 44 additions and 21 deletions.
21 changes: 21 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1622,6 +1622,27 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
   if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
     if (Instruction *F = factorizeFAddFSub(I, Builder))
       return F;
+
+    // Try to fold fadd into start value of reduction intrinsic.
+    if (match(&I, m_c_FAdd(m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_fadd>(
+                               m_AnyZeroFP(), m_Value(X))),
+                           m_Value(Y)))) {
+      // fadd (rdx 0.0, X), Y --> rdx Y, X
+      return replaceInstUsesWith(
+          I, Builder.CreateIntrinsic(Intrinsic::vector_reduce_fadd,
+                                     {X->getType()}, {Y, X}, &I));
+    }
+    const APFloat *StartC, *C;
+    if (match(LHS, m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_fadd>(
+                       m_APFloat(StartC), m_Value(X)))) &&
+        match(RHS, m_APFloat(C))) {
+      // fadd (rdx StartC, X), C --> rdx (C + StartC), X
+      Constant *NewStartC = ConstantFP::get(I.getType(), *C + *StartC);
+      return replaceInstUsesWith(
+          I, Builder.CreateIntrinsic(Intrinsic::vector_reduce_fadd,
+                                     {X->getType()}, {NewStartC, X}, &I));
+    }
+
     if (Value *V = FAddCombine(Builder).simplify(&I))
       return replaceInstUsesWith(I, V);
   }
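The second match above covers the case where the reduction's start value and the other fadd operand are both constants; the two constants are pre-added into a new start value at compile time. A small sketch of that rewrite, mirroring the @fadd_rdx_nonzero_start_const_op test below (function names illustrative only):

declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)

; Before: constant start value (42.0) plus a constant fadd operand (-9.0).
define float @before_const(<4 x float> %v) {
  %rdx = call float @llvm.vector.reduce.fadd.v4f32(float 42.0, <4 x float> %v)
  %add = fadd reassoc nsz float %rdx, -9.0
  ret float %add
}

; After: the constants collapse into one start value: 42.0 + (-9.0) = 33.0.
define float @after_const(<4 x float> %v) {
  %rdx = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float 33.0, <4 x float> %v)
  ret float %rdx
}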
21 changes: 12 additions & 9 deletions llvm/test/Transforms/InstCombine/fadd.ll
@@ -391,9 +391,8 @@ define float @fmul_fneg2_extra_use3(float %x, float %py, float %z) {
 
 define float @fadd_rdx(float %x, <4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx(
-; CHECK-NEXT: [[RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[V:%.*]])
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[RDX]], [[X:%.*]]
-; CHECK-NEXT: ret float [[ADD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[X:%.*]], <4 x float> [[V:%.*]])
+; CHECK-NEXT: ret float [[TMP1]]
 ;
   %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> %v)
   %add = fadd fast float %rdx, %x
@@ -403,16 +402,17 @@ define float @fadd_rdx(float %x, <4 x float> %v) {
 define float @fadd_rdx_commute(float %x, <4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx_commute(
 ; CHECK-NEXT: [[D:%.*]] = fdiv float 4.200000e+01, [[X:%.*]]
-; CHECK-NEXT: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[V:%.*]])
-; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc nsz float [[D]], [[RDX]]
-; CHECK-NEXT: ret float [[ADD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float [[D]], <4 x float> [[V:%.*]])
+; CHECK-NEXT: ret float [[TMP1]]
 ;
   %d = fdiv float 42.0, %x
   %rdx = call float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %v)
   %add = fadd reassoc nsz float %d, %rdx
   ret float %add
 }
 
+; Negative test - require nsz to be safer (and reassoc obviously).
+
 define float @fadd_rdx_fmf(float %x, <4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx_fmf(
 ; CHECK-NEXT: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[V:%.*]])
@@ -424,6 +424,8 @@ define float @fadd_rdx_fmf(float %x, <4 x float> %v) {
   ret float %add
 }
 
+; Negative test - don't replace a single add with another reduction.
+
 define float @fadd_rdx_extra_use(float %x, <4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx_extra_use(
 ; CHECK-NEXT: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[V:%.*]])
@@ -439,15 +441,16 @@ define float @fadd_rdx_extra_use(float %x, <4 x float> %v) {
 
 define float @fadd_rdx_nonzero_start_const_op(<4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx_nonzero_start_const_op(
-; CHECK-NEXT: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float 4.200000e+01, <4 x float> [[V:%.*]])
-; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc ninf nsz float [[RDX]], -9.000000e+00
-; CHECK-NEXT: ret float [[ADD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc ninf nsz float @llvm.vector.reduce.fadd.v4f32(float 3.300000e+01, <4 x float> [[V:%.*]])
+; CHECK-NEXT: ret float [[TMP1]]
 ;
   %rdx = call float @llvm.vector.reduce.fadd.v4f32(float 42.0, <4 x float> %v)
   %add = fadd reassoc nsz ninf float %rdx, -9.0
   ret float %add
 }
 
+; Negative test - we don't change the order of ops unless it saves an instruction.
+
 define float @fadd_rdx_nonzero_start_variable_op(float %x, <4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx_nonzero_start_variable_op(
 ; CHECK-NEXT: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float 4.200000e+01, <4 x float> [[V:%.*]])
@@ -12,7 +12,7 @@ define i32 @add_v4i32(i32* %p) #0 {
 ; CHECK-LABEL: @add_v4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, [[TBAA0:!tbaa !.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -51,7 +51,7 @@ define signext i16 @mul_v8i16(i16* %p) #0 {
 ; CHECK-LABEL: @mul_v8i16(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[P:%.*]] to <8 x i16>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2, [[TBAA4:!tbaa !.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2, !tbaa [[TBAA4:![0-9]+]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <8 x i16> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -95,7 +95,7 @@ define signext i8 @or_v16i8(i8* %p) #0 {
 ; CHECK-LABEL: @or_v16i8(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P:%.*]] to <16 x i8>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1, [[TBAA6:!tbaa !.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1, !tbaa [[TBAA6:![0-9]+]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <16 x i8> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x i8> [[BIN_RDX]], <16 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -141,7 +141,7 @@ define i32 @smin_v4i32(i32* %p) #0 {
 ; CHECK-LABEL: @smin_v4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, [[TBAA0]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP1]], <4 x i32> [[RDX_SHUF]]
@@ -195,7 +195,7 @@ define i32 @umax_v4i32(i32* %p) #0 {
 ; CHECK-LABEL: @umax_v4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, [[TBAA0]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <4 x i32> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP1]], <4 x i32> [[RDX_SHUF]]
@@ -249,15 +249,14 @@ define float @fadd_v4i32(float* %p) #0 {
 ; CHECK-LABEL: @fadd_v4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7:!tbaa !.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !tbaa [[TBAA7:![0-9]+]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF3]]
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast float -0.000000e+00, [[TMP2]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[BIN_RDX5]], 4.200000e+01
-; CHECK-NEXT: ret float [[OP_EXTRA]]
+; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast float 4.200000e+01, [[TMP2]]
+; CHECK-NEXT: ret float [[BIN_RDX5]]
 ;
 entry:
   br label %for.cond
@@ -290,7 +289,7 @@ define float @fmul_v4i32(float* %p) #0 {
 ; CHECK-LABEL: @fmul_v4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !tbaa [[TBAA7]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul fast <4 x float> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -331,7 +330,7 @@ define float @fmin_v4f32(float* %p) #0 {
 ; CHECK-LABEL: @fmin_v4f32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !tbaa [[TBAA7]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP1]], <4 x float> [[RDX_SHUF]]
@@ -410,7 +409,7 @@ cond.end: ; preds = %cond.false, %cond.t
 define float @findMax(<8 x float>* byval(<8 x float>) align 16 %0) {
 ; CHECK-LABEL: @findMax(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[V:%.*]] = load <8 x float>, <8 x float>* [[TMP0:%.*]], align 16, [[TBAA0]]
+; CHECK-NEXT: [[V:%.*]] = load <8 x float>, <8 x float>* [[TMP0:%.*]], align 16, !tbaa [[TBAA0]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[V]], <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf nsz ogt <8 x float> [[V]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf nsz <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[V]], <8 x float> [[RDX_SHUF]]
