ScheduleDAGInstrs: Add condjump deps to addSchedBarrierDeps()
addSchedBarrierDeps() is supposed to add use operands to the ExitSU
node. The current implementation adds uses for call/barrier instructions
and the MBB live-outs in all other cases, but the use operands of
conditional jump instructions were missed.

Also add code to macrofusion that sets the latencies between the fused
nodes to zero, so they do not linger in the pending list.

Differential Revision: https://reviews.llvm.org/D25140

llvm-svn: 286544
MatzeB committed Nov 11, 2016
1 parent a89d8ff commit 325cd2c
Showing 15 changed files with 66 additions and 66 deletions.
3 changes: 1 addition & 2 deletions llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1019,8 +1019,7 @@ createStoreClusterDAGMutation(const TargetInstrInfo *TII,
                               const TargetRegisterInfo *TRI);
 
 std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI);
+createMacroFusionDAGMutation(const TargetInstrInfo *TII);
 
 std::unique_ptr<ScheduleDAGMutation>
 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
50 changes: 20 additions & 30 deletions llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1501,10 +1501,9 @@ namespace {
 /// that may be fused by the processor into a single operation.
 class MacroFusion : public ScheduleDAGMutation {
   const TargetInstrInfo &TII;
-  const TargetRegisterInfo &TRI;
 public:
-  MacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI)
-    : TII(TII), TRI(TRI) {}
+  MacroFusion(const TargetInstrInfo &TII)
+    : TII(TII) {}
 
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 };
@@ -1513,27 +1512,12 @@ class MacroFusion : public ScheduleDAGMutation {
 namespace llvm {
 
 std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI) {
-  return make_unique<MacroFusion>(*TII, *TRI);
+createMacroFusionDAGMutation(const TargetInstrInfo *TII) {
+  return make_unique<MacroFusion>(*TII);
 }
 
 } // namespace llvm
 
-/// Returns true if \p MI reads a register written by \p Other.
-static bool HasDataDep(const TargetRegisterInfo &TRI, const MachineInstr &MI,
-                       const MachineInstr &Other) {
-  for (const MachineOperand &MO : MI.uses()) {
-    if (!MO.isReg() || !MO.readsReg())
-      continue;
-
-    unsigned Reg = MO.getReg();
-    if (Other.modifiesRegister(Reg, &TRI))
-      return true;
-  }
-  return false;
-}
-
 /// \brief Callback from DAG postProcessing to create cluster edges to encourage
 /// fused operations.
 void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
@@ -1545,16 +1529,12 @@ void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
   if (!Branch)
     return;
 
-  for (SUnit &SU : DAG->SUnits) {
-    // SUnits with successors can't be schedule in front of the ExitSU.
-    if (!SU.Succs.empty())
-      continue;
-    // We only care if the node writes to a register that the branch reads.
-    MachineInstr *Pred = SU.getInstr();
-    if (!HasDataDep(TRI, *Branch, *Pred))
+  for (SDep &PredDep : ExitSU.Preds) {
+    if (PredDep.isWeak())
       continue;
-
-    if (!TII.shouldScheduleAdjacent(*Pred, *Branch))
+    SUnit &SU = *PredDep.getSUnit();
+    MachineInstr &Pred = *SU.getInstr();
+    if (!TII.shouldScheduleAdjacent(Pred, *Branch))
       continue;
 
     // Create a single weak edge from SU to ExitSU. The only effect is to cause
@@ -1567,6 +1547,16 @@ void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
     (void)Success;
     assert(Success && "No DAG nodes should be reachable from ExitSU");
 
+    // Adjust latency of data deps between the nodes.
+    for (SDep &PredDep : ExitSU.Preds) {
+      if (PredDep.getSUnit() == &SU)
+        PredDep.setLatency(0);
+    }
+    for (SDep &SuccDep : SU.Succs) {
+      if (SuccDep.getSUnit() == &ExitSU)
+        SuccDep.setLatency(0);
+    }
+
     DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
     break;
   }
@@ -3128,7 +3118,7 @@ static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) {
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   }
   if (EnableMacroFusion)
-    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII, DAG->TRI));
+    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
   return DAG;
 }
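A note on the new shape of MacroFusion::apply(): rather than scanning every SUnit and re-deriving the data dependence by hand (the deleted HasDataDep helper), the loop now walks ExitSU.Preds directly, which is exactly the set that addSchedBarrierDeps() now populates for conditional jumps, and it zeroes the latency on both directions of the fused edge. The following standalone sketch uses toy stand-ins for SUnit/SDep (not LLVM's real classes) to model just the latency adjustment:

#include <vector>

// Toy stand-ins for llvm::SUnit/llvm::SDep, only to model the edge update.
struct ToySUnit;

struct ToySDep {
  ToySUnit *Other;   // unit on the far end of the edge
  unsigned Latency;  // scheduling latency of the edge
};

struct ToySUnit {
  std::vector<ToySDep> Preds, Succs;
};

// Mirrors the two loops added to MacroFusion::apply(): once SU is fused
// with ExitSU, the edges between the pair carry zero latency so neither
// node sits in the scheduler's pending list waiting for latency to elapse.
void zeroFusedLatency(ToySUnit &SU, ToySUnit &ExitSU) {
  for (ToySDep &PredDep : ExitSU.Preds)
    if (PredDep.Other == &SU)
      PredDep.Latency = 0;
  for (ToySDep &SuccDep : SU.Succs)
    if (SuccDep.Other == &ExitSU)
      SuccDep.Latency = 0;
}

int main() {
  ToySUnit SU, ExitSU;
  ExitSU.Preds.push_back({&SU, 3}); // a data dep with latency 3
  SU.Succs.push_back({&ExitSU, 3}); // same edge, successor side
  zeroFusedLatency(SU, ExitSU);
  return ExitSU.Preds[0].Latency + SU.Succs[0].Latency; // now 0
}

With the edge latency at zero, the scheduler treats the branch as ready as soon as the fused instruction issues, so neither node waits in the pending queue.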
11 changes: 4 additions & 7 deletions llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -247,11 +247,8 @@ void ScheduleDAGInstrs::exitRegion() {
 void ScheduleDAGInstrs::addSchedBarrierDeps() {
   MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr;
   ExitSU.setInstr(ExitMI);
-  bool AllDepKnown = ExitMI &&
-    (ExitMI->isCall() || ExitMI->isBarrier());
-  if (ExitMI && AllDepKnown) {
-    // If it's a call or a barrier, add dependencies on the defs and uses of
-    // instruction.
+  // Add dependencies on the defs and uses of the instruction.
+  if (ExitMI) {
     for (const MachineOperand &MO : ExitMI->operands()) {
       if (!MO.isReg() || MO.isDef()) continue;
       unsigned Reg = MO.getReg();
@@ -261,10 +258,10 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
         addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO));
       }
     }
-  } else {
+  }
+  if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) {
     // For others, e.g. fallthrough, conditional branch, assume the exit
     // uses all the registers that are livein to the successor blocks.
-    assert(Uses.empty() && "Uses in set before adding deps?");
     for (const MachineBasicBlock *Succ : BB->successors()) {
       for (const auto &LI : Succ->liveins()) {
         if (!Uses.contains(LI.PhysReg))
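To see the behavioral change in addSchedBarrierDeps() at a glance: the exit node now collects the use operands of any exit instruction, a conditional jump included, and it additionally assumes the successor blocks' live-ins unless the exit is a call or barrier, where all dependencies are already known. A minimal standalone model of that control flow, with toy types standing in for MachineInstr and the register sets:

#include <cstdio>
#include <set>
#include <vector>

// Toy stand-in for a terminator MachineInstr.
struct ToyExitInstr {
  bool IsCall = false, IsBarrier = false;
  std::vector<unsigned> UseRegs; // register use operands, e.g. the flags reg
};

// Models the post-patch logic: use operands of *any* exit instruction feed
// ExitSU (previously only calls/barriers contributed them), and successor
// live-ins are added unless the exit is a call/barrier.
std::set<unsigned> exitNodeUses(const ToyExitInstr *ExitMI,
                                const std::set<unsigned> &SuccLiveIns) {
  std::set<unsigned> Uses;
  if (ExitMI)
    Uses.insert(ExitMI->UseRegs.begin(), ExitMI->UseRegs.end());
  if (!ExitMI || (!ExitMI->IsCall && !ExitMI->IsBarrier))
    Uses.insert(SuccLiveIns.begin(), SuccLiveIns.end());
  return Uses;
}

int main() {
  ToyExitInstr CondJump;            // a conditional branch reading reg 1
  CondJump.UseRegs = {1};
  std::set<unsigned> LiveIns = {2, 3};
  for (unsigned R : exitNodeUses(&CondJump, LiveIns))
    std::printf("ExitSU uses reg %u\n", R); // prints 1, 2 and 3
  return 0;
}

In the toy run above, the register read by the conditional jump (reg 1 here) now shows up as an ExitSU use alongside the live-ins, which is precisely the dependency the old code dropped.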
28 changes: 14 additions & 14 deletions llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -78,8 +78,8 @@ declare i32 @doSomething(i32, i32*)
 ; Next BB.
 ; CHECK: [[LOOP:LBB[0-9_]+]]: ; %for.body
 ; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: cbnz [[IV]], [[LOOP]]
 ;
 ; Next BB.
@@ -144,8 +144,8 @@ declare i32 @something(...)
 ; Next BB.
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
 ; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
 ; Next BB.
 ; CHECK: ; %for.end
@@ -188,8 +188,8 @@ for.end: ; preds = %for.body
 ;
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
 ; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
 ; Next BB.
 ; CHECK: bl _somethingElse
@@ -259,8 +259,8 @@ declare void @somethingElse(...)
 ;
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
 ; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
 ; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
 ; Next BB.
 ; CHECK: lsl w0, [[SUM]], #3
@@ -333,32 +333,32 @@ entry:
 ;
 ; Sum is merged with the returned register.
 ; CHECK: add [[VA_BASE:x[0-9]+]], sp, #16
-; CHECK-NEXT: str [[VA_BASE]], [sp, #8]
 ; CHECK-NEXT: cmp w1, #1
+; CHECK-NEXT: str [[VA_BASE]], [sp, #8]
+; CHECK-NEXT: mov [[SUM:w0]], wzr
 ; CHECK-NEXT: b.lt [[IFEND_LABEL:LBB[0-9_]+]]
-; CHECK: mov [[SUM:w0]], wzr
 ;
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
 ; CHECK: ldr [[VA_ADDR:x[0-9]+]], [sp, #8]
 ; CHECK-NEXT: add [[NEXT_VA_ADDR:x[0-9]+]], [[VA_ADDR]], #8
 ; CHECK-NEXT: str [[NEXT_VA_ADDR]], [sp, #8]
 ; CHECK-NEXT: ldr [[VA_VAL:w[0-9]+]], {{\[}}[[VA_ADDR]]]
-; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
 ; CHECK-NEXT: sub w1, w1, #1
+; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
 ; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]]
+; DISABLE-NEXT: b [[IFEND_LABEL]]
 ;
-; DISABLE-NEXT: b
 ; DISABLE: [[ELSE_LABEL]]: ; %if.else
 ; DISABLE: lsl w0, w1, #1
 ;
-; ENABLE: [[ELSE_LABEL]]: ; %if.else
-; ENABLE: lsl w0, w1, #1
-; ENABLE-NEXT: ret
-;
 ; CHECK: [[IFEND_LABEL]]:
 ; Epilogue code.
 ; CHECK: add sp, sp, #16
 ; CHECK-NEXT: ret
+;
+; ENABLE: [[ELSE_LABEL]]: ; %if.else
+; ENABLE-NEXT: lsl w0, w1, #1
+; ENABLE_NEXT: ret
 define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 {
 entry:
   %ap = alloca i8*, align 8
@@ -413,9 +413,9 @@ declare void @llvm.va_end(i8*)
 ;
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
 ; Inline asm statement.
-; CHECK: add x19, x19, #1
 ; CHECK: sub [[IV]], [[IV]], #1
-; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
+; CHECK: add x19, x19, #1
+; CHECK: cbnz [[IV]], [[LOOP_LABEL]]
 ; Next BB.
 ; CHECK: mov w0, wzr
 ; Epilogue code.
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AArch64/misched-fusion.ll
@@ -1,4 +1,4 @@
-; RUN: llc -o - %s -mattr=+arith-cbz-fusion,+use-postra-scheduler | FileCheck %s
+; RUN: llc -o - %s -mattr=+arith-cbz-fusion | FileCheck %s
 ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
 
 target triple = "arm64-apple-ios"
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AArch64/neg-imm.ll
@@ -30,9 +30,9 @@ if.then3:
 
 for.inc:
 ; CHECK_LABEL: %for.inc
-; CHECK: add
-; CHECK-NEXT: cmp
-; CHECK: b.le
+; CHECK: cmp
+; CHECK-NEXT: add
+; CHECK-NEXT: b.le
 ; CHECK_LABEL: %for.cond.cleanup
   %inc = add nsw i32 %x.015, 1
   %cmp1 = icmp sgt i32 %x.015, %px
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -171,14 +171,14 @@ bb3:
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT: v_add_i32_e32 [[INC:v[0-9]+]], vcc, 1, [[LOOPIDX]]
 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 10, [[INC]]
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
 
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: ;;#ASMEND
 
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
@@ -426,14 +426,15 @@ endif:
 ; GCN-NEXT: s_setpc_b64 vcc
+
 ; GCN-NEXT: [[LOOP_BODY]]: ; %loop_body
-; GCN: s_mov_b64 vcc, -1{{$}}
 ; GCN: ;;#ASMSTART
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
 ; GCN: ;;#ASMEND
+; GCN-NEXT: s_mov_b64 vcc, -1{{$}}
 ; GCN-NEXT: s_cbranch_vccz [[RET]]
 
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop_body
@@ -493,6 +494,7 @@ ret:
 
 ; GCN: [[LONG_BR_DEST0]]
 ; GCN: s_cmp_eq_u32
+; GCN-NEXT: ; implicit-def
 ; GCN-NEXT: s_cbranch_scc0
 ; GCN: s_setpc_b64
 
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -7,7 +7,7 @@ declare i1 @llvm.amdgcn.class.f32(float, i32)
 ; GCN-LABEL: {{^}}vcc_shrink_vcc_def:
 ; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
-; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
 define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
@@ -34,7 +34,7 @@ bb2:
 ; GCN-LABEL: {{^}}preserve_condition_undef_flag:
 ; GCN-NOT: vcc
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
-; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
 define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
8 changes: 7 additions & 1 deletion llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -78,6 +78,8 @@ entry:
 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
 
 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
+; IDXMODE: v_mov_b32_e32 v2, 2
+; IDXMODE: v_mov_b32_e32 v3, 3
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
@@ -95,6 +97,10 @@ entry:
 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
 
 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
+; IDXMODE: v_mov_b32_e32 v0,
+; IDXMODE: v_mov_b32_e32 v1,
+; IDXMODE: v_mov_b32_e32 v2,
+; IDXMODE: v_mov_b32_e32 v3,
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
@@ -572,12 +578,12 @@ bb7: ; preds = %bb4, %bb1
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a80000
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000
 ; GCN-DAG: s_load_dword [[ARG:s[0-9]+]]
-; IDXMODE-DAG: s_add_i32 [[ARG_ADD:s[0-9]+]], [[ARG]], -16
 
 ; MOVREL-DAG: s_add_i32 m0, [[ARG]], -16
 ; MOVREL: v_movreld_b32_e32 v[[VEC0_ELT0]], 4.0
 ; GCN-NOT: m0
 
+; IDXMODE-DAG: s_add_i32 [[ARG_ADD:s[0-9]+]], [[ARG]], -16
 ; IDXMODE: s_set_gpr_idx_on [[ARG_ADD]], dst
 ; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT0]], 4.0
 ; IDXMODE: s_set_gpr_idx_off
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -138,6 +138,7 @@ exit:
 
 ; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
 ; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; CHECK-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
 
 ; CHECK-NEXT: ; BB#1: ; %bb
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -213,8 +213,8 @@ END:
 ;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK: image_sample
-;CHECK: store
 ;CHECK: v_cmp
+;CHECK: store
 define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
1 change: 1 addition & 0 deletions llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll
@@ -644,6 +644,7 @@ declare double @llvm.pow.f64(double, double)
 ; CHECK: push
 ;
 ; DISABLE: tst{{(\.w)?}} r2, #1
+; DISABLE-NEXT: vst1.64
 ; DISABLE-NEXT: beq [[BB13:LBB[0-9_]+]]
 ;
 ; CHECK: bl{{x?}} _pow
6 changes: 5 additions & 1 deletion llvm/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
@@ -210,6 +210,8 @@ for.end: ; preds = %for.body
 ; CHECK: mflr {{[0-9]+}}
 ;
 ; DISABLE: cmplwi 0, 3, 0
+; DISABLE-NEXT: std
+; DISABLE-NEXT: std
 ; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; Loop preheader
@@ -290,6 +292,8 @@ declare void @somethingElse(...)
 ; CHECK: mflr {{[0-9]+}}
 ;
 ; DISABLE: cmplwi 0, 3, 0
+; DISABLE-NEXT: std
+; DISABLE-NEXT: std
 ; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; CHECK: bl somethingElse
@@ -377,8 +381,8 @@ entry:
 ; ENABLE-DAG: li [[IV:[0-9]+]], 10
 ; ENABLE-DAG: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
 ;
-; DISABLE: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
 ; DISABLE: cmplwi 0, 3, 0
+; DISABLE-NEXT: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
 ; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
 ; DISABLE: li [[IV:[0-9]+]], 10
 ;
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
@@ -56,9 +56,9 @@ bb5: ; preds = %bb, %entry
 define i32 @test_inlineasm(i32 %a) nounwind {
 entry:
 ;CHECK-LABEL: test_inlineasm:
-;CHECK: cmp
 ;CHECK: sethi
 ;CHECK: !NO_APP
+;CHECK-NEXT: cmp
 ;CHECK-NEXT: ble
 ;CHECK-NEXT: mov
   tail call void asm sideeffect "sethi 0, %g0", ""() nounwind
