Skip to content

Commit

Permalink
[ARM] Move t2DoLoopStart reg alloc hint
Browse files Browse the repository at this point in the history
This adjusts the place that the t2DoLoopStart reg allocation hint is
inserted, adding it in the ARMTPAndVPTOptimizaionPass in a similar place
as other tail predicated loop optimizations. This removes the need for
doing so in a custom inserter, and should make the hint more accurate,
only adding it where we expect to create a DLS (not DLSTP or WLS).
  • Loading branch information
davemgreen committed Mar 11, 2021
1 parent fad70c3 commit bd516d2
Show file tree
Hide file tree
Showing 12 changed files with 95 additions and 66 deletions.
8 changes: 0 additions & 8 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
Expand Up @@ -11386,14 +11386,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return EmitLowered__chkstk(MI, BB);
case ARM::WIN__DBZCHK:
return EmitLowered__dbzchk(MI, BB);
case ARM::t2DoLoopStart:
// We are just here to set a register allocation hint, prefering lr for the
// input register to make it more likely to be movable and removable, later
// in the pipeline.
Register R = MI.getOperand(1).getReg();
MachineFunction *MF = MI.getParent()->getParent();
MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
return BB;
}
}

Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/ARM/ARMInstrThumb2.td
Expand Up @@ -5459,7 +5459,6 @@ let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in {

// t2DoLoopStart a pseudo for DLS hardware loops. Lowered into a DLS in
// ARMLowOverheadLoops if possible, or reverted to a Mov if not.
let usesCustomInserter = 1 in
def t2DoLoopStart :
t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br,
[(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>;
Expand Down
17 changes: 17 additions & 0 deletions llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
Expand Up @@ -75,6 +75,7 @@ class MVETPAndVPTOptimisations : public MachineFunctionPass {
bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
bool ConvertVPSEL(MachineBasicBlock &MBB);
bool HintDoLoopStartReg(MachineBasicBlock &MBB);
};

char MVETPAndVPTOptimisations::ID = 0;
Expand Down Expand Up @@ -946,6 +947,21 @@ bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
return !DeadInstructions.empty();
}

// Add a registry allocation hint for t2DoLoopStart to hint it towards LR, as
// the instruction may be removable as a noop.
bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) {
bool Changed = false;
for (MachineInstr &MI : MBB.instrs()) {
if (MI.getOpcode() != ARM::t2DoLoopStart)
continue;
Register R = MI.getOperand(1).getReg();
MachineFunction *MF = MI.getParent()->getParent();
MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
Changed = true;
}
return Changed;
}

bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
const ARMSubtarget &STI =
static_cast<const ARMSubtarget &>(Fn.getSubtarget());
Expand All @@ -969,6 +985,7 @@ bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
}

for (MachineBasicBlock &MBB : Fn) {
Modified |= HintDoLoopStartReg(MBB);
Modified |= ReplaceConstByVPNOTs(MBB, DT);
Modified |= ReplaceVCMPsByVPNOTs(MBB);
Modified |= ReduceOldVCCRValueUses(MBB);
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
Expand Up @@ -219,8 +219,9 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: add.w lr, r12, r3, lsr #2
; CHECK-NEXT: add.w r12, r12, r3, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
Expand Up @@ -152,7 +152,7 @@ if.end: ; preds = %while.body, %while.
; CHECK: be_ne
; CHECK: body:
; CHECK: bb.0.entry:
; CHECK: $lr = t2DLS killed renamable $r12
; CHECK: $lr =
; CHECK: bb.2.do.body:
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
Expand Down
15 changes: 10 additions & 5 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
Expand Up @@ -15,8 +15,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -90,8 +91,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -165,8 +167,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -240,8 +243,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -315,8 +319,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down
35 changes: 22 additions & 13 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
Expand Up @@ -73,7 +73,8 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu
; CHECK-NEXT: bic r3, r3, #7
; CHECK-NEXT: sub.w r12, r3, #8
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #3
; CHECK-NEXT: add.w r3, r3, r12, lsr #3
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
Expand Down Expand Up @@ -145,7 +146,8 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(i8* nocaptur
; CHECK-NEXT: bic r3, r3, #15
; CHECK-NEXT: sub.w r12, r3, #16
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #4
; CHECK-NEXT: add.w r3, r3, r12, lsr #4
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.8 r2
Expand Down Expand Up @@ -214,7 +216,8 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(i8* nocaptu
; CHECK-NEXT: bic r3, r3, #7
; CHECK-NEXT: sub.w r12, r3, #8
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #3
; CHECK-NEXT: add.w r3, r3, r12, lsr #3
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
Expand Down Expand Up @@ -285,7 +288,8 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(i8* nocaptur
; CHECK-NEXT: bic r3, r3, #15
; CHECK-NEXT: sub.w r12, r3, #16
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #4
; CHECK-NEXT: add.w r3, r3, r12, lsr #4
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.8 r2
Expand Down Expand Up @@ -354,7 +358,8 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(i8* nocaptu
; CHECK-NEXT: bic r3, r3, #7
; CHECK-NEXT: sub.w r12, r3, #8
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #3
; CHECK-NEXT: add.w r3, r3, r12, lsr #3
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
Expand Down Expand Up @@ -413,19 +418,20 @@ for.cond.cleanup: ; preds = %middle.block, %entr
define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
; CHECK-LABEL: two_loops_mul_add_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: beq .LBB6_8
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: subs r6, r3, #4
; CHECK-NEXT: subs r7, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r3, r6, lsr #2
; CHECK-NEXT: add.w r6, r3, r7, lsr #2
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: dls lr, r6
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r3
Expand All @@ -445,8 +451,9 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q0, r3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r6, lsr #2
; CHECK-NEXT: add.w r3, r3, r7, lsr #2
; CHECK-NEXT: vmov.32 q0[0], r12
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB6_5: @ %vector.body46
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand All @@ -463,10 +470,10 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read
; CHECK-NEXT: vaddv.u32 r12, q0
; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup7
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .LBB6_8:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%cmp35 = icmp eq i32 %N, 0
br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph
Expand Down Expand Up @@ -548,9 +555,10 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(i8* nocaptur
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: add.w lr, r4, r3, lsr #3
; CHECK-NEXT: add.w r12, r4, r3, lsr #3
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: mov r4, r1
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB7_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
Expand Down Expand Up @@ -668,7 +676,8 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) {
; CHECK-NEXT: adds r0, r2, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: add.w lr, r12, r0, lsr #2
; CHECK-NEXT: add.w r0, r12, r0, lsr #2
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: .LBB8_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
Expand Up @@ -57,8 +57,9 @@ define i32 @bad(i32* readonly %x, i32* nocapture readonly %y, i32 %n) {
; CHECK-NEXT: subs r3, r2, r3
; CHECK-NEXT: add.w r12, r3, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r3, r3, r12, lsr #2
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB1_1: @ %do.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down
Expand Up @@ -15,8 +15,9 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -91,8 +92,9 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
; CHECK-NEXT: add.w r3, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -161,8 +163,9 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
; CHECK-NEXT: add.w r3, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down

0 comments on commit bd516d2

Please sign in to comment.