Skip to content

Commit b33fbba

Browse files
committed
Reland [SimplifyCFG] FoldBranchToCommonDest: lift use-restriction on bonus instructions
This was orginally committed in 2245fb8. but was immediately reverted in f3abd54 because of a PHI handling issue. Original commit message: 1. It doesn't make sense to enforce that the bonus instruction is only used once in it's basic block. What matters is whether those user instructions fit within our budget, sure, but that is another question. 2. It doesn't make sense to enforce that said bonus instructions are only used within their basic block. Perhaps the branch condition isn't using the value computed by said bonus instruction, and said bonus instruction is simply being calculated to be used in successors? So iff we can clone bonus instructions, to lift these restrictions, we just need to carefully update their external uses to use the new cloned instructions. Notably, this transform (even without this change) appears to be poison-unsafe as per alive2, but is otherwise (including the patch) legal. We don't introduce any new PHI nodes, but only "move" the instructions around, i'm not really seeing much potential for extra cost modelling for the transform, especially since now we allow at most one such bonus instruction by default. This causes the fold to fire +11.4% more (13216 -> 14725) as of vanilla llvm test-suite + RawSpeed. The motivational pattern is IEEE-754-2008 Binary16->Binary32 extension code: https://github.com/darktable-org/rawspeed/blob/ca57d77fb2ba81f21fc712cfac26e54f46406473/src/librawspeed/common/FloatingPoint.h#L115-L120 ^ that should be a switch, but it is not now: https://godbolt.org/z/bvja5v That being said, even thought this seemed like this would fix it: https://godbolt.org/z/xGq3TM apparently that fold is happening somewhere else afterall, so something else also has a similar 'artificial' restriction.
1 parent 4018806 commit b33fbba

File tree

6 files changed

+245
-235
lines changed

6 files changed

+245
-235
lines changed

llvm/lib/Transforms/Utils/SimplifyCFG.cpp

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2779,15 +2779,9 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
27792779
// Ignore dbg intrinsics.
27802780
if (isa<DbgInfoIntrinsic>(I))
27812781
continue;
2782-
if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(&*I))
2782+
// I must be safe to execute unconditionally.
2783+
if (!isSafeToSpeculativelyExecute(&*I))
27832784
return Changed;
2784-
// I has only one use and can be executed unconditionally.
2785-
Instruction *User = dyn_cast<Instruction>(I->user_back());
2786-
if (User == nullptr || User->getParent() != BB)
2787-
return Changed;
2788-
// I is used in the same BB. Since BI uses Cond and doesn't have more slots
2789-
// to use any other instruction, User must be an instruction between next(I)
2790-
// and Cond.
27912785

27922786
// Account for the cost of duplicating this instruction into each
27932787
// predecessor.
@@ -2883,6 +2877,13 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
28832877
PBI->swapSuccessors();
28842878
}
28852879

2880+
// Before cloning instructions, notify the successor basic block that it
2881+
// is about to have a new predecessor. This will update PHI nodes,
2882+
// which will allow us to update live-out uses of bonus instructions.
2883+
if (BI->isConditional())
2884+
AddPredecessorToBlock(PBI->getSuccessor(0) == BB ? TrueDest : FalseDest,
2885+
PredBlock, BB, MSSAU);
2886+
28862887
// If we have bonus instructions, clone them into the predecessor block.
28872888
// Note that there may be multiple predecessor blocks, so we cannot move
28882889
// bonus instructions to a predecessor block.
@@ -2914,6 +2915,18 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
29142915
PredBlock->getInstList().insert(PBI->getIterator(), NewBonusInst);
29152916
NewBonusInst->takeName(&*BonusInst);
29162917
BonusInst->setName(BonusInst->getName() + ".old");
2918+
BonusInst->replaceUsesWithIf(NewBonusInst, [BB](Use &U) {
2919+
auto *User = cast<Instruction>(U.getUser());
2920+
// Ignore uses in the same block as the bonus instruction itself.
2921+
if (User->getParent() == BB)
2922+
return false;
2923+
// We can safely update external non-PHI uses.
2924+
if (!isa<PHINode>(User))
2925+
return true;
2926+
// For PHI nodes, don't touch incoming values for same block
2927+
// as the bonus instruction itself.
2928+
return cast<PHINode>(User)->getIncomingBlock(U) != BB;
2929+
});
29172930
}
29182931

29192932
// Clone Cond into the predecessor basic block, and or/and the
@@ -2955,7 +2968,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
29552968
(SuccFalseWeight + SuccTrueWeight) +
29562969
PredTrueWeight * SuccFalseWeight);
29572970
}
2958-
AddPredecessorToBlock(TrueDest, PredBlock, BB, MSSAU);
29592971
PBI->setSuccessor(0, TrueDest);
29602972
}
29612973
if (PBI->getSuccessor(1) == BB) {
@@ -2970,7 +2982,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
29702982
// FalseWeight is FalseWeight for PBI * FalseWeight for BI.
29712983
NewWeights.push_back(PredFalseWeight * SuccFalseWeight);
29722984
}
2973-
AddPredecessorToBlock(FalseDest, PredBlock, BB, MSSAU);
29742985
PBI->setSuccessor(1, FalseDest);
29752986
}
29762987
if (NewWeights.size() == 2) {

llvm/test/CodeGen/Thumb2/mve-float16regloops.ll

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1077,36 +1077,34 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
10771077
; CHECK-NEXT: sub sp, #24
10781078
; CHECK-NEXT: cmp r3, #8
10791079
; CHECK-NEXT: blo.w .LBB16_12
1080-
; CHECK-NEXT: @ %bb.1: @ %if.then
1081-
; CHECK-NEXT: movs r7, #0
1082-
; CHECK-NEXT: cmp.w r7, r3, lsr #2
1080+
; CHECK-NEXT: @ %bb.1: @ %entry
1081+
; CHECK-NEXT: lsrs.w r12, r3, #2
10831082
; CHECK-NEXT: beq.w .LBB16_12
10841083
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
10851084
; CHECK-NEXT: ldrh r4, [r0]
1086-
; CHECK-NEXT: lsr.w r9, r3, #2
1087-
; CHECK-NEXT: ldrd r5, r12, [r0, #4]
1088-
; CHECK-NEXT: movs r3, #1
1085+
; CHECK-NEXT: movs r6, #1
1086+
; CHECK-NEXT: ldrd r5, r3, [r0, #4]
10891087
; CHECK-NEXT: sub.w r0, r4, #8
10901088
; CHECK-NEXT: and r8, r0, #7
10911089
; CHECK-NEXT: add.w r7, r0, r0, lsr #29
1092-
; CHECK-NEXT: asrs r6, r7, #3
1093-
; CHECK-NEXT: cmp r6, #1
1090+
; CHECK-NEXT: asr.w lr, r7, #3
1091+
; CHECK-NEXT: cmp.w lr, #1
10941092
; CHECK-NEXT: it gt
1095-
; CHECK-NEXT: asrgt r3, r7, #3
1093+
; CHECK-NEXT: asrgt r6, r7, #3
10961094
; CHECK-NEXT: add.w r7, r5, r4, lsl #1
1097-
; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
1098-
; CHECK-NEXT: subs r3, r7, #2
1099-
; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
1100-
; CHECK-NEXT: rsbs r3, r4, #0
1101-
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
1102-
; CHECK-NEXT: add.w r3, r12, #16
1095+
; CHECK-NEXT: subs r7, #2
1096+
; CHECK-NEXT: str r7, [sp, #20] @ 4-byte Spill
1097+
; CHECK-NEXT: rsbs r7, r4, #0
1098+
; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
1099+
; CHECK-NEXT: add.w r7, r3, #16
1100+
; CHECK-NEXT: str r6, [sp] @ 4-byte Spill
11031101
; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
1104-
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
1102+
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
11051103
; CHECK-NEXT: b .LBB16_4
11061104
; CHECK-NEXT: .LBB16_3: @ %while.end
11071105
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
11081106
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
1109-
; CHECK-NEXT: subs.w r9, r9, #1
1107+
; CHECK-NEXT: subs.w r12, r12, #1
11101108
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
11111109
; CHECK-NEXT: vstrb.8 q0, [r2], #8
11121110
; CHECK-NEXT: add.w r0, r5, r0, lsl #1
@@ -1117,16 +1115,16 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
11171115
; CHECK-NEXT: @ Child Loop BB16_6 Depth 2
11181116
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
11191117
; CHECK-NEXT: vldrw.u32 q0, [r1], #8
1120-
; CHECK-NEXT: ldrh.w lr, [r12, #14]
1121-
; CHECK-NEXT: ldrh.w r0, [r12, #12]
1118+
; CHECK-NEXT: ldrh.w lr, [r3, #14]
1119+
; CHECK-NEXT: ldrh r0, [r3, #12]
11221120
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
11231121
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
1124-
; CHECK-NEXT: ldrh.w r4, [r12, #10]
1125-
; CHECK-NEXT: ldrh.w r7, [r12, #8]
1126-
; CHECK-NEXT: ldrh.w r3, [r12, #6]
1127-
; CHECK-NEXT: ldrh.w r6, [r12, #4]
1128-
; CHECK-NEXT: ldrh.w r11, [r12, #2]
1129-
; CHECK-NEXT: ldrh.w r10, [r12]
1122+
; CHECK-NEXT: ldrh r4, [r3, #10]
1123+
; CHECK-NEXT: ldrh r7, [r3, #8]
1124+
; CHECK-NEXT: ldrh r6, [r3, #6]
1125+
; CHECK-NEXT: ldrh.w r9, [r3, #4]
1126+
; CHECK-NEXT: ldrh.w r11, [r3, #2]
1127+
; CHECK-NEXT: ldrh.w r10, [r3]
11301128
; CHECK-NEXT: vstrb.8 q0, [r1], #8
11311129
; CHECK-NEXT: vldrw.u32 q0, [r5]
11321130
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
@@ -1136,10 +1134,10 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
11361134
; CHECK-NEXT: adds r1, r5, #6
11371135
; CHECK-NEXT: vfma.f16 q0, q1, r11
11381136
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
1139-
; CHECK-NEXT: vfma.f16 q0, q1, r6
1137+
; CHECK-NEXT: vfma.f16 q0, q1, r9
11401138
; CHECK-NEXT: vldrw.u32 q1, [r1]
11411139
; CHECK-NEXT: add.w r1, r5, #10
1142-
; CHECK-NEXT: vfma.f16 q0, q1, r3
1140+
; CHECK-NEXT: vfma.f16 q0, q1, r6
11431141
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
11441142
; CHECK-NEXT: vfma.f16 q0, q1, r7
11451143
; CHECK-NEXT: vldrw.u32 q1, [r1]

llvm/test/CodeGen/Thumb2/mve-float32regloops.ll

Lines changed: 31 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,36 +1049,34 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
10491049
; CHECK-NEXT: sub sp, #32
10501050
; CHECK-NEXT: cmp r3, #8
10511051
; CHECK-NEXT: blo.w .LBB16_12
1052-
; CHECK-NEXT: @ %bb.1: @ %if.then
1053-
; CHECK-NEXT: movs r7, #0
1054-
; CHECK-NEXT: cmp.w r7, r3, lsr #2
1052+
; CHECK-NEXT: @ %bb.1: @ %entry
1053+
; CHECK-NEXT: lsrs.w r12, r3, #2
10551054
; CHECK-NEXT: beq.w .LBB16_12
10561055
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
1057-
; CHECK-NEXT: ldrh r4, [r0]
1058-
; CHECK-NEXT: lsr.w r10, r3, #2
1059-
; CHECK-NEXT: ldrd r5, r12, [r0, #4]
1060-
; CHECK-NEXT: movs r3, #1
1061-
; CHECK-NEXT: sub.w r7, r4, #8
1062-
; CHECK-NEXT: add.w r0, r7, r7, lsr #29
1063-
; CHECK-NEXT: asrs r6, r0, #3
1064-
; CHECK-NEXT: cmp r6, #1
1056+
; CHECK-NEXT: ldrh r6, [r0]
1057+
; CHECK-NEXT: movs r4, #1
1058+
; CHECK-NEXT: ldrd r5, r10, [r0, #4]
1059+
; CHECK-NEXT: sub.w r3, r6, #8
1060+
; CHECK-NEXT: add.w r0, r3, r3, lsr #29
1061+
; CHECK-NEXT: asrs r7, r0, #3
1062+
; CHECK-NEXT: cmp r7, #1
10651063
; CHECK-NEXT: it gt
1066-
; CHECK-NEXT: asrgt r3, r0, #3
1067-
; CHECK-NEXT: add.w r0, r5, r4, lsl #2
1064+
; CHECK-NEXT: asrgt r4, r0, #3
1065+
; CHECK-NEXT: add.w r0, r5, r6, lsl #2
10681066
; CHECK-NEXT: sub.w r9, r0, #4
1069-
; CHECK-NEXT: rsbs r0, r4, #0
1070-
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
1071-
; CHECK-NEXT: and r3, r7, #7
1067+
; CHECK-NEXT: rsbs r0, r6, #0
1068+
; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill
1069+
; CHECK-NEXT: and r4, r3, #7
10721070
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
1073-
; CHECK-NEXT: add.w r0, r12, #32
1074-
; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill
1071+
; CHECK-NEXT: add.w r0, r10, #32
1072+
; CHECK-NEXT: str r6, [sp, #20] @ 4-byte Spill
10751073
; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill
1076-
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
1074+
; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
10771075
; CHECK-NEXT: b .LBB16_4
10781076
; CHECK-NEXT: .LBB16_3: @ %while.end
10791077
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
10801078
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
1081-
; CHECK-NEXT: subs.w r10, r10, #1
1079+
; CHECK-NEXT: subs.w r12, r12, #1
10821080
; CHECK-NEXT: vstrb.8 q0, [r2], #16
10831081
; CHECK-NEXT: add.w r0, r5, r0, lsl #2
10841082
; CHECK-NEXT: add.w r5, r0, #16
@@ -1087,25 +1085,25 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
10871085
; CHECK-NEXT: @ =>This Loop Header: Depth=1
10881086
; CHECK-NEXT: @ Child Loop BB16_6 Depth 2
10891087
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
1088+
; CHECK-NEXT: add.w lr, r10, #8
10901089
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
1091-
; CHECK-NEXT: ldrd r7, r6, [r12]
1092-
; CHECK-NEXT: ldrd r0, r4, [r12, #8]
1093-
; CHECK-NEXT: ldrd r3, lr, [r12, #16]
1094-
; CHECK-NEXT: ldrd r11, r8, [r12, #24]
1090+
; CHECK-NEXT: ldrd r3, r7, [r10]
1091+
; CHECK-NEXT: ldm.w lr, {r0, r4, r6, lr}
1092+
; CHECK-NEXT: ldrd r11, r8, [r10, #24]
10951093
; CHECK-NEXT: vstrb.8 q0, [r9], #16
10961094
; CHECK-NEXT: vldrw.u32 q0, [r5], #32
10971095
; CHECK-NEXT: strd r9, r1, [sp, #24] @ 8-byte Folded Spill
10981096
; CHECK-NEXT: vldrw.u32 q1, [r5, #-28]
1099-
; CHECK-NEXT: vmul.f32 q0, q0, r7
1097+
; CHECK-NEXT: vmul.f32 q0, q0, r3
11001098
; CHECK-NEXT: vldrw.u32 q6, [r5, #-24]
11011099
; CHECK-NEXT: vldrw.u32 q4, [r5, #-20]
1102-
; CHECK-NEXT: vfma.f32 q0, q1, r6
1100+
; CHECK-NEXT: vfma.f32 q0, q1, r7
11031101
; CHECK-NEXT: vldrw.u32 q5, [r5, #-16]
11041102
; CHECK-NEXT: vfma.f32 q0, q6, r0
11051103
; CHECK-NEXT: vldrw.u32 q2, [r5, #-12]
11061104
; CHECK-NEXT: vfma.f32 q0, q4, r4
11071105
; CHECK-NEXT: vldrw.u32 q3, [r5, #-8]
1108-
; CHECK-NEXT: vfma.f32 q0, q5, r3
1106+
; CHECK-NEXT: vfma.f32 q0, q5, r6
11091107
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
11101108
; CHECK-NEXT: vfma.f32 q0, q2, lr
11111109
; CHECK-NEXT: vldrw.u32 q1, [r5, #-4]
@@ -1149,26 +1147,26 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
11491147
; CHECK-NEXT: .LBB16_8: @ %for.end
11501148
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
11511149
; CHECK-NEXT: ldrd r9, r1, [sp, #24] @ 8-byte Folded Reload
1152-
; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
1153-
; CHECK-NEXT: cmp.w r3, #0
1150+
; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
1151+
; CHECK-NEXT: cmp.w r4, #0
11541152
; CHECK-NEXT: beq .LBB16_3
11551153
; CHECK-NEXT: b .LBB16_9
11561154
; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
11571155
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
1158-
; CHECK-NEXT: mov r6, r5
1159-
; CHECK-NEXT: mov lr, r3
1156+
; CHECK-NEXT: mov r3, r5
1157+
; CHECK-NEXT: mov lr, r4
11601158
; CHECK-NEXT: .LBB16_10: @ %while.body76
11611159
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
11621160
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
11631161
; CHECK-NEXT: ldr r0, [r7], #4
1164-
; CHECK-NEXT: vldrw.u32 q1, [r6], #4
1162+
; CHECK-NEXT: vldrw.u32 q1, [r3], #4
11651163
; CHECK-NEXT: subs.w lr, lr, #1
11661164
; CHECK-NEXT: vfma.f32 q0, q1, r0
11671165
; CHECK-NEXT: bne .LBB16_10
11681166
; CHECK-NEXT: b .LBB16_11
11691167
; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit
11701168
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
1171-
; CHECK-NEXT: add.w r5, r5, r3, lsl #2
1169+
; CHECK-NEXT: add.w r5, r5, r4, lsl #2
11721170
; CHECK-NEXT: b .LBB16_3
11731171
; CHECK-NEXT: .LBB16_12: @ %if.end
11741172
; CHECK-NEXT: add sp, #32

0 commit comments

Comments
 (0)