[PowerPC] Duplicate inherited heuristic from base scheduler
PowerPC has its own custom scheduler heuristic. The overridden
tryCandidate calls the parent class's tryCandidate, but since that
function returns void, calling it this way does not actually help. This
patch duplicates the code from the base scheduler into the PPC machine
scheduler class, which achieves what was intended.

Reviewed By: steven.zhang

Differential Revision: https://reviews.llvm.org/D94464
ecnelises committed Jan 22, 2021
1 parent b6c3a59 commit 449f2f7
Showing 13 changed files with 478 additions and 344 deletions.
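
The inheritance problem described in the commit message is easiest to see in miniature. The sketch below is a self-contained model, not the LLVM API: every type and class name in it is an invented stand-in. Because the base hook returns void and reports its verdict only by mutating the candidate it was given, a target override that calls it first can only bolt extra heuristics onto the end of the chain; it cannot place one at a chosen priority in the middle.

// Self-contained model of the void-hook problem. All names here are
// illustrative stand-ins, not the LLVM API.
#include <iostream>

enum Reason { NoCand, PhysReg, RegExcess, NodeOrder };

struct Candidate {
  int NodeNum = -1;
  Reason Why = NoCand;
  bool isValid() const { return NodeNum >= 0; }
};

struct GenericStrategy {
  virtual ~GenericStrategy() = default;
  // Returns void: the only output channel is the mutation of TryCand.
  virtual void tryCandidate(Candidate &Cand, Candidate &TryCand) {
    if (!Cand.isValid()) {
      TryCand.Why = NodeOrder;
      return;
    }
    TryCand.Why = RegExcess; // pretend a mid-priority heuristic fired
  }
};

struct PPCStrategy : GenericStrategy {
  void tryCandidate(Candidate &Cand, Candidate &TryCand) override {
    GenericStrategy::tryCandidate(Cand, TryCand);
    // A target heuristic added here runs only after *every* base
    // heuristic; there is no way to slot it between, say, the pressure
    // checks and the tie-breakers. Owning a copy of the base code, as
    // this patch does, removes that limitation.
    if (TryCand.Why != NodeOrder && TryCand.Why != NoCand)
      return;
    // ... target-specific bias would go here ...
  }
};

int main() {
  Candidate Cand, TryCand;
  Cand.NodeNum = 1;
  TryCand.NodeNum = 2;
  PPCStrategy S;
  S.tryCandidate(Cand, TryCand);
  std::cout << "decided by reason " << TryCand.Why << '\n'; // prints 2
}
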
140 changes: 134 additions & 6 deletions llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
@@ -49,10 +49,103 @@ bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand,
 void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
-  GenericScheduler::tryCandidate(Cand, TryCand, Zone);
+  // From GenericScheduler::tryCandidate

-  if (!Cand.isValid() || !Zone)
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
     return;
+  }
+
+  // Bias PhysReg Defs and copies to their uses and defined respectively.
+  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+    return;
+
+  // Avoid exceeding the target's limit.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+                  RegExcess, TRI, DAG->MF))
+    return;
+
+  // Avoid increasing the max critical pressure in the scheduled region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+                  TryCand, Cand, RegCritical, TRI, DAG->MF))
+    return;
+
+  // We only compare a subset of features when comparing nodes between
+  // Top and Bottom boundary. Some properties are simply incomparable, in many
+  // other instances we should only override the other boundary if something
+  // is a clear good pick on one boundary. Skip heuristics that are more
+  // "tie-breaking" in nature.
+  bool SameBoundary = Zone != nullptr;
+  if (SameBoundary) {
+    // For loops that are acyclic path limited, aggressively schedule for
+    // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
+    // heuristics to take precedence.
+    if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
+        tryLatency(TryCand, Cand, *Zone))
+      return;
+
+    // Prioritize instructions that read unbuffered resources by stall cycles.
+    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+      return;
+  }
+
+  // Keep clustered nodes together to encourage downstream peephole
+  // optimizations which may reduce resource requirements.
+  //
+  // This is a best effort to set things up for a post-RA pass. Optimizations
+  // like generating loads of multiple registers should ideally be done within
+  // the scheduler pass by combining the loads during DAG postprocessing.
+  const SUnit *CandNextClusterSU =
+      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  const SUnit *TryCandNextClusterSU =
+      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
+    return;
+
+  if (SameBoundary) {
+    // Weak edges are for clustering and other constraints.
+    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+      return;
+  }
+
+  // Avoid increasing the max pressure of the entire region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+                  Cand, RegMax, TRI, DAG->MF))
+    return;
+
+  if (SameBoundary) {
+    // Avoid critical resource consumption and balance the schedule.
+    TryCand.initResourceDelta(DAG, SchedModel);
+    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+                TryCand, Cand, ResourceReduce))
+      return;
+    if (tryGreater(TryCand.ResDelta.DemandedResources,
+                   Cand.ResDelta.DemandedResources, TryCand, Cand,
+                   ResourceDemand))
+      return;
+
+    // Avoid serializing long latency dependence chains.
+    // For acyclic path limited loops, latency was already checked above.
+    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+      return;
+
+    // Fall through to original instruction order.
+    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
+        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+      TryCand.Reason = NodeOrder;
+    }
+  }
+
+  // GenericScheduler::tryCandidate end

   // Add powerpc specific heuristic only when TryCand isn't selected or
   // selected as node order.
@@ -61,8 +154,10 @@ void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,

   // There are some benefits to schedule the ADDI before the load to hide the
   // latency, as RA may create a true dependency between the load and addi.
-  if (biasAddiLoadCandidate(Cand, TryCand, *Zone))
-    return;
+  if (SameBoundary) {
+    if (biasAddiLoadCandidate(Cand, TryCand, *Zone))
+      return;
+  }
 }

 bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand,
@@ -79,11 +174,44 @@ bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand,

 void PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
                                           SchedCandidate &TryCand) {
-  PostGenericScheduler::tryCandidate(Cand, TryCand);
+  // From PostGenericScheduler::tryCandidate

+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return;
+  }
+
+  // Prioritize instructions that read unbuffered resources by stall cycles.
+  if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
+              Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+    return;

-  if (!Cand.isValid())
+  // Keep clustered nodes together.
+  if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
+                 Cand.SU == DAG->getNextClusterSucc(), TryCand, Cand, Cluster))
     return;

+  // Avoid critical resource consumption and balance the schedule.
+  if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+              TryCand, Cand, ResourceReduce))
+    return;
+  if (tryGreater(TryCand.ResDelta.DemandedResources,
+                 Cand.ResDelta.DemandedResources, TryCand, Cand,
+                 ResourceDemand))
+    return;
+
+  // Avoid serializing long latency dependence chains.
+  if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
+    return;
+  }
+
+  // Fall through to original instruction order.
+  if (TryCand.SU->NodeNum < Cand.SU->NodeNum)
+    TryCand.Reason = NodeOrder;
+
+  // PostGenericScheduler::tryCandidate end

   // Add powerpc post ra specific heuristic only when TryCand isn't selected or
   // selected as node order.
   if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand)
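
The duplicated cascade above leans on the comparison helpers declared alongside the generic scheduler: tryLess, tryGreater, tryPressure, and tryLatency each either commit to a winner, recording the deciding reason on the candidate, or return false so the next, lower-priority heuristic gets a look. Below is a self-contained approximation of the two integer comparators; the authoritative versions live in llvm/lib/CodeGen/MachineScheduler.cpp, and this sketch simplifies their types.

// Approximate shape of the scheduler's comparison helpers (simplified
// stand-in types; see MachineScheduler.cpp for the real ones).
enum CandReason { NoCand, PhysReg, RegExcess, RegCritical, Stall, Cluster,
                  Weak, RegMax, ResourceReduce, ResourceDemand, NodeOrder };

struct CandidateModel {
  CandReason Reason = NoCand;
};

// Prefer the lower value; return true when the chain should stop here.
static bool tryLess(int TryVal, int CandVal, CandidateModel &TryCand,
                    CandidateModel &Cand, CandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason; // TryCand wins at this priority
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;  // Cand wins; record the higher-priority reason
    return true;
  }
  return false;              // tie: fall through to the next heuristic
}

// Mirror image: prefer the higher value.
static bool tryGreater(int TryVal, int CandVal, CandidateModel &TryCand,
                       CandidateModel &Cand, CandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  return false;
}

int main() {
  CandidateModel Cand, TryCand;
  // TryCand stalls less, so it wins with reason Stall and the chain stops.
  if (tryLess(/*TryVal=*/0, /*CandVal=*/2, TryCand, Cand, Stall))
    return TryCand.Reason == Stall ? 0 : 1;
  return 1;
}

Because each helper either decides or falls through, the textual order of the if (try...(...)) return; statements is the priority order. That is why a target that wants its own rule at a specific rank has to own the whole cascade rather than call the base and patch the result afterwards.
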
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/PowerPC/botheightreduce.mir
@@ -26,17 +26,17 @@ body: |
 ; CHECK: [[LI8_6:%[0-9]+]]:g8rc = LI8 7
 ; CHECK: bb.1:
 ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
-; CHECK: [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 [[ADDI8_]], 1
 ; CHECK: [[LD:%[0-9]+]]:g8rc = LD 0, [[ADDI8_]] :: (load 8)
 ; CHECK: [[LDX:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_]] :: (load 8)
 ; CHECK: [[LDX1:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_3]] :: (load 8)
 ; CHECK: [[LD1:%[0-9]+]]:g8rc = LD 4, [[ADDI8_]] :: (load 8)
 ; CHECK: [[LDX2:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_4]] :: (load 8)
 ; CHECK: [[LDX3:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_5]] :: (load 8)
 ; CHECK: [[LDX4:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_6]] :: (load 8)
-; CHECK: [[LDX5:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_2]] :: (load 8)
-; CHECK: [[MULLD:%[0-9]+]]:g8rc = MULLD [[LDX]], [[LD]]
 ; CHECK: [[LD2:%[0-9]+]]:g8rc = LD 8, [[ADDI8_]] :: (load 8)
+; CHECK: [[MULLD:%[0-9]+]]:g8rc = MULLD [[LDX]], [[LD]]
+; CHECK: [[LDX5:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_2]] :: (load 8)
+; CHECK: [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 [[ADDI8_]], 1
 ; CHECK: [[MULLD1:%[0-9]+]]:g8rc = MULLD [[MULLD]], [[LDX5]]
 ; CHECK: [[MULLD2:%[0-9]+]]:g8rc = MULLD [[MULLD1]], [[LDX1]]
 ; CHECK: [[MULLD3:%[0-9]+]]:g8rc = MULLD [[MULLD2]], [[LD1]]
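
The botheightreduce.mir update above shows the new heuristic in schedule order: the ADDI8 that bumps the loop's address register now issues after the loads it used to precede. A toy cascade makes the structural point; the names and the bias rule here are invented for illustration and are not the real biasAddiLoadCandidate.

#include <cstdio>

// Toy priority cascade (invented names, not the real PPC scheduler code).
enum Reason { NoCand, Stall, AddiBias, NodeOrder };

struct Cand {
  int Node = 0;
  int StallCycles = 0;
  bool IsAddi = false;
  Reason Why = NoCand;
};

// Generic rule: fewer stall cycles wins. True means the chain stops.
static bool tryLessStalls(Cand &C, Cand &Try) {
  if (Try.StallCycles < C.StallCycles) {
    Try.Why = Stall;
    return true;
  }
  return Try.StallCycles > C.StallCycles;
}

static void tryCandidate(Cand &C, Cand &Try) {
  if (tryLessStalls(C, Try))
    return;
  // The slot the duplication buys: a target rule that sits after the
  // generic heuristics but ahead of the final node-order tie-break,
  // instead of being appended after the base has already decided.
  if (Try.IsAddi && !C.IsAddi) {
    Try.Why = AddiBias;
    return;
  }
  if (Try.Node < C.Node)
    Try.Why = NodeOrder;
}

int main() {
  Cand Load;
  Load.Node = 1;
  Cand Addi;
  Addi.Node = 2;
  Addi.IsAddi = true;
  tryCandidate(Load, Addi);
  std::printf("winner reason = %d\n", Addi.Why); // 2 (AddiBias)
}
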
66 changes: 32 additions & 34 deletions llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
@@ -116,15 +116,14 @@ define i64 @test_ds_prep(i8* %0, i32 signext %1) {
 ; CHECK-NEXT:    li r3, 0
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_2:
-; CHECK-NEXT:    ldx r9, r6, r7
-; CHECK-NEXT:    ld r10, 0(r6)
-; CHECK-NEXT:    ldx r11, r6, r5
-; CHECK-NEXT:    addi r8, r6, 1
-; CHECK-NEXT:    ld r6, 4(r6)
-; CHECK-NEXT:    mulld r9, r10, r9
-; CHECK-NEXT:    mulld r9, r9, r11
-; CHECK-NEXT:    maddld r3, r9, r6, r3
-; CHECK-NEXT:    mr r6, r8
+; CHECK-NEXT:    ldx r8, r6, r7
+; CHECK-NEXT:    ld r9, 0(r6)
+; CHECK-NEXT:    ldx r10, r6, r5
+; CHECK-NEXT:    ld r11, 4(r6)
+; CHECK-NEXT:    addi r6, r6, 1
+; CHECK-NEXT:    mulld r8, r9, r8
+; CHECK-NEXT:    mulld r8, r8, r10
+; CHECK-NEXT:    maddld r3, r8, r11, r3
 ; CHECK-NEXT:    bdnz .LBB1_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    add r3, r3, r4
@@ -217,25 +216,24 @@ define i64 @test_max_number_reminder(i8* %0, i32 signext %1) {
 ; CHECK-NEXT:    li r3, 0
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB2_2:
-; CHECK-NEXT:    ldx r12, r9, r6
-; CHECK-NEXT:    ld r0, 0(r9)
-; CHECK-NEXT:    ldx r30, r9, r5
-; CHECK-NEXT:    ldx r29, r9, r7
-; CHECK-NEXT:    addi r11, r9, 1
-; CHECK-NEXT:    mulld r12, r0, r12
-; CHECK-NEXT:    ld r28, 4(r9)
-; CHECK-NEXT:    ldx r27, r9, r8
-; CHECK-NEXT:    ld r26, 12(r9)
-; CHECK-NEXT:    ld r25, 8(r9)
-; CHECK-NEXT:    ldx r9, r9, r10
-; CHECK-NEXT:    mulld r12, r12, r30
-; CHECK-NEXT:    mulld r12, r12, r29
-; CHECK-NEXT:    mulld r12, r12, r28
-; CHECK-NEXT:    mulld r12, r12, r27
-; CHECK-NEXT:    mulld r12, r12, r26
-; CHECK-NEXT:    mulld r12, r12, r25
-; CHECK-NEXT:    maddld r3, r12, r9, r3
-; CHECK-NEXT:    mr r9, r11
+; CHECK-NEXT:    ldx r11, r9, r6
+; CHECK-NEXT:    ld r12, 0(r9)
+; CHECK-NEXT:    ldx r0, r9, r5
+; CHECK-NEXT:    ldx r30, r9, r7
+; CHECK-NEXT:    mulld r11, r12, r11
+; CHECK-NEXT:    ld r29, 4(r9)
+; CHECK-NEXT:    ldx r28, r9, r8
+; CHECK-NEXT:    ld r27, 12(r9)
+; CHECK-NEXT:    ld r26, 8(r9)
+; CHECK-NEXT:    ldx r25, r9, r10
+; CHECK-NEXT:    addi r9, r9, 1
+; CHECK-NEXT:    mulld r11, r11, r0
+; CHECK-NEXT:    mulld r11, r11, r30
+; CHECK-NEXT:    mulld r11, r11, r29
+; CHECK-NEXT:    mulld r11, r11, r28
+; CHECK-NEXT:    mulld r11, r11, r27
+; CHECK-NEXT:    mulld r11, r11, r26
+; CHECK-NEXT:    maddld r3, r11, r25, r3
 ; CHECK-NEXT:    bdnz .LBB2_2
 ; CHECK-NEXT:    b .LBB2_4
 ; CHECK-NEXT:  .LBB2_3:
@@ -624,10 +622,10 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) {
 ; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    beq cr0, .LBB6_8
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    cmpldi r4, 1
 ; CHECK-NEXT:    li r7, 1
 ; CHECK-NEXT:    addi r6, r3, 4009
+; CHECK-NEXT:    addis r5, r2, .LC0@toc@ha
 ; CHECK-NEXT:    ld r5, .LC0@toc@l(r5)
 ; CHECK-NEXT:    iselgt r8, r4, r7
 ; CHECK-NEXT:    lis r4, -21846
@@ -639,11 +637,11 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) {
 ; CHECK-NEXT:    li r30, 1
 ; CHECK-NEXT:    ld r5, 0(r5)
 ; CHECK-NEXT:    mtctr r8
-; CHECK-NEXT:    li r8, -9
-; CHECK-NEXT:    addi r5, r5, -1
 ; CHECK-NEXT:    ori r4, r4, 43691
+; CHECK-NEXT:    li r8, -9
 ; CHECK-NEXT:    li r29, 1
 ; CHECK-NEXT:    li r28, 1
+; CHECK-NEXT:    addi r5, r5, -1
 ; CHECK-NEXT:    b .LBB6_4
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB6_2:
@@ -652,8 +650,8 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) {
 ; CHECK-NEXT:    ld r0, -8(r6)
 ; CHECK-NEXT:    add r29, r0, r29
 ; CHECK-NEXT:  .LBB6_3:
-; CHECK-NEXT:    addi r6, r6, 1
 ; CHECK-NEXT:    mulld r0, r29, r28
+; CHECK-NEXT:    addi r6, r6, 1
 ; CHECK-NEXT:    mulld r0, r0, r30
 ; CHECK-NEXT:    mulld r0, r0, r12
 ; CHECK-NEXT:    mulld r0, r0, r11
@@ -802,8 +800,8 @@ define float @test_ds_float(i8* %0, i32 signext %1) {
 ; CHECK-NEXT:    cmpwi r4, 1
 ; CHECK-NEXT:    blt cr0, .LBB7_4
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    addi r3, r3, 4002
 ; CHECK-NEXT:    clrldi r4, r4, 32
+; CHECK-NEXT:    addi r3, r3, 4002
 ; CHECK-NEXT:    xxlxor f1, f1, f1
 ; CHECK-NEXT:    mtctr r4
 ; CHECK-NEXT:    li r4, -1
@@ -884,8 +882,8 @@ define float @test_ds_combine_float_int(i8* %0, i32 signext %1) {
 ; CHECK-NEXT:    cmpwi r4, 1
 ; CHECK-NEXT:    blt cr0, .LBB8_4
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    addi r3, r3, 4002
 ; CHECK-NEXT:    clrldi r4, r4, 32
+; CHECK-NEXT:    addi r3, r3, 4002
 ; CHECK-NEXT:    xxlxor f1, f1, f1
 ; CHECK-NEXT:    mtctr r4
 ; CHECK-NEXT:    li r4, -1
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/PowerPC/lsr-ctrloop.ll
@@ -14,9 +14,9 @@
 define void @foo(float* nocapture %data, float %d) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xscvdpspn 0, 1
 ; CHECK-NEXT:    li 5, 83
 ; CHECK-NEXT:    addi 4, 3, 192
+; CHECK-NEXT:    xscvdpspn 0, 1
 ; CHECK-NEXT:    mtctr 5
 ; CHECK-NEXT:    xxspltw 0, 0, 0
 ; CHECK-NEXT:    .p2align 4
11 changes: 5 additions & 6 deletions llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -400,9 +400,9 @@ define void @testcse4(<512 x i1>* %res, i32 %lim, <16 x i8>* %vc) {
 ; CHECK-NEXT:  .LBB9_2: # %for.body
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    rldic r7, r6, 4, 28
-; CHECK-NEXT:    addi r6, r6, 6
 ; CHECK-NEXT:    xxsetaccz acc2
 ; CHECK-NEXT:    xxsetaccz acc1
+; CHECK-NEXT:    addi r6, r6, 6
 ; CHECK-NEXT:    lxvx vs0, r5, r7
 ; CHECK-NEXT:    add r7, r5, r7
 ; CHECK-NEXT:    lxv vs1, 16(r7)
@@ -414,8 +414,8 @@ define void @testcse4(<512 x i1>* %res, i32 %lim, <16 x i8>* %vc) {
 ; CHECK-NEXT:    lxv vs12, 64(r7)
 ; CHECK-NEXT:    lxv vs13, 80(r7)
 ; CHECK-NEXT:    rldic r7, r4, 6, 26
-; CHECK-NEXT:    addi r4, r4, 3
 ; CHECK-NEXT:    xxsetaccz acc0
+; CHECK-NEXT:    addi r4, r4, 3
 ; CHECK-NEXT:    xxmfacc acc1
 ; CHECK-NEXT:    xvf32gernp acc0, vs12, vs13
 ; CHECK-NEXT:    stxvx vs11, r3, r7
@@ -449,9 +449,9 @@ define void @testcse4(<512 x i1>* %res, i32 %lim, <16 x i8>* %vc) {
 ; CHECK-BE-NEXT:  .LBB9_2: # %for.body
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    rldic r7, r6, 4, 28
-; CHECK-BE-NEXT:    addi r6, r6, 6
 ; CHECK-BE-NEXT:    xxsetaccz acc2
 ; CHECK-BE-NEXT:    xxsetaccz acc1
+; CHECK-BE-NEXT:    addi r6, r6, 6
 ; CHECK-BE-NEXT:    lxvx vs0, r5, r7
 ; CHECK-BE-NEXT:    add r7, r5, r7
 ; CHECK-BE-NEXT:    lxv vs1, 16(r7)
@@ -463,8 +463,8 @@ define void @testcse4(<512 x i1>* %res, i32 %lim, <16 x i8>* %vc) {
 ; CHECK-BE-NEXT:    lxv vs12, 64(r7)
 ; CHECK-BE-NEXT:    lxv vs13, 80(r7)
 ; CHECK-BE-NEXT:    rldic r7, r4, 6, 26
-; CHECK-BE-NEXT:    addi r4, r4, 3
 ; CHECK-BE-NEXT:    xxsetaccz acc0
+; CHECK-BE-NEXT:    addi r4, r4, 3
 ; CHECK-BE-NEXT:    xxmfacc acc1
 ; CHECK-BE-NEXT:    xvf32gernp acc0, vs12, vs13
 ; CHECK-BE-NEXT:    stxvx vs8, r3, r7
@@ -544,8 +544,7 @@ for.body: ; preds = %for.body, %for.body
 declare i32 @testRedundantPrimeUnprimeF()
 define void @testRedundantPrimeUnprime(<512 x i1>* %dst, <16 x i8> %vc) nounwind {
 ; CHECK-LABEL: testRedundantPrimeUnprime:
-; CHECK:         .localentry testRedundantPrimeUnprime, 1
-; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mflr r0
 ; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r0, 16(r1)
