diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 832147ebd0a96..0ac176ede8271 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -97,6 +97,11 @@ SinkInstsIntoLoop("sink-insts-to-avoid-spills", "register spills"), cl::init(false), cl::Hidden); +static cl::opt SinkIntoLoopLimit( + "machine-sink-loop-limit", + cl::desc("The maximum number of instructions considered for loop sinking."), + cl::init(50), cl::Hidden); + STATISTIC(NumSunk, "Number of machine instructions sunk"); STATISTIC(NumLoopSunk, "Number of machine instructions sunk into a loop"); STATISTIC(NumSplit, "Number of critical edges split"); @@ -468,7 +473,15 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { // Walk the candidates in reverse order so that we start with the use // of a def-use chain, if there is any. + // TODO: Sort the candidates using a cost-model. + unsigned i = 0; for (auto It = Candidates.rbegin(); It != Candidates.rend(); ++It) { + if (i++ == SinkIntoLoopLimit) { + LLVM_DEBUG(dbgs() << "LoopSink: Limit reached of instructions to " + "be analysed."); + break; + } + MachineInstr *I = *It; if (!SinkIntoLoop(L, *I)) break; @@ -1243,6 +1256,10 @@ bool MachineSinking::SinkIntoLoop(MachineLoop *L, MachineInstr &I) { LLVM_DEBUG(dbgs() << "LoopSink: Not sinking, sink block is the preheader\n"); return false; } + if (SinkBlock->size() > SinkLoadInstsPerBlockThreshold) { + LLVM_DEBUG(dbgs() << "LoopSink: Not Sinking, block too large to analyse.\n"); + return false; + } LLVM_DEBUG(dbgs() << "LoopSink: Sinking instruction!\n"); SinkBlock->splice(SinkBlock->getFirstNonPHI(), Preheader, I); diff --git a/llvm/test/CodeGen/AArch64/loop-sink-limit.mir b/llvm/test/CodeGen/AArch64/loop-sink-limit.mir new file mode 100644 index 0000000000000..2d85f023f0e45 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/loop-sink-limit.mir @@ -0,0 +1,178 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=machine-sink -sink-insts-to-avoid-spills \ +# RUN: -machine-sink-loop-limit=1 -verify-machineinstrs %s -o - 2>&1 | \ +# RUN: FileCheck %s --check-prefix=SINK1 +# +# RUN: llc -mtriple aarch64 -run-pass=machine-sink -sink-insts-to-avoid-spills \ +# RUN: -machine-sink-loop-limit=2 -verify-machineinstrs %s -o - 2>&1 | \ +# RUN: FileCheck %s --check-prefix=SINK2 + +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64" + + @A = external dso_local global [100 x i32], align 4 + %struct.A = type { i32, i32, i32, i32, i32, i32 } + + define i32 @do_sink_use_is_not_a_copy(i32 %n) { + entry: + %cmp63 = icmp sgt i32 %n, 0 + br i1 %cmp63, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %0 = load i32, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @A, i64 0, i64 0), align 4 + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ %n, %entry ], [ %div, %for.body ] + ret i32 %sum.0.lcssa + + for.body: ; preds = %for.body, %for.body.preheader + %lsr.iv = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ] + %sum.065 = phi i32 [ %div, %for.body ], [ %n, %for.body.preheader ] + %div = sdiv i32 %sum.065, %0 + %lsr.iv.next = add i32 %lsr.iv, -1 + %exitcond.not = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + } + +... +--- +name: do_sink_use_is_not_a_copy +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: + - { id: 0, class: gpr32, preferred-register: '' } + - { id: 1, class: gpr32all, preferred-register: '' } + - { id: 2, class: gpr32sp, preferred-register: '' } + - { id: 3, class: gpr32, preferred-register: '' } + - { id: 4, class: gpr32all, preferred-register: '' } + - { id: 5, class: gpr32all, preferred-register: '' } + - { id: 6, class: gpr32common, preferred-register: '' } + - { id: 7, class: gpr32, preferred-register: '' } + - { id: 8, class: gpr64common, preferred-register: '' } + - { id: 9, class: gpr32, preferred-register: '' } + - { id: 10, class: gpr32, preferred-register: '' } + - { id: 11, class: gpr32, preferred-register: '' } +liveins: + - { reg: '$w0', virtual-reg: '%6' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + ; SINK1-LABEL: name: do_sink_use_is_not_a_copy + ; SINK1: bb.0.entry: + ; SINK1: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; SINK1: liveins: $w0 + ; SINK1: [[COPY:%[0-9]+]]:gpr32common = COPY $w0 + ; SINK1: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri [[COPY]], 1, 0, implicit-def $nzcv + ; SINK1: Bcc 11, %bb.2, implicit $nzcv + ; SINK1: B %bb.1 + ; SINK1: bb.1.for.body.preheader: + ; SINK1: successors: %bb.3(0x80000000) + ; SINK1: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) @A + ; SINK1: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui killed [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) @A :: (dereferenceable load 4 from `i32* getelementptr inbounds ([100 x i32], [100 x i32]* @A, i64 0, i64 0)`) + ; SINK1: B %bb.3 + ; SINK1: bb.2.for.cond.cleanup: + ; SINK1: [[PHI:%[0-9]+]]:gpr32all = PHI [[COPY]], %bb.0, %4, %bb.3 + ; SINK1: $w0 = COPY [[PHI]] + ; SINK1: RET_ReallyLR implicit $w0 + ; SINK1: bb.3.for.body: + ; SINK1: successors: %bb.2(0x04000000), %bb.3(0x7c000000) + ; SINK1: [[PHI1:%[0-9]+]]:gpr32sp = PHI [[COPY]], %bb.1, %5, %bb.3 + ; SINK1: [[PHI2:%[0-9]+]]:gpr32 = PHI [[COPY]], %bb.1, %4, %bb.3 + ; SINK1: [[SDIVWr:%[0-9]+]]:gpr32 = SDIVWr [[PHI2]], [[LDRWui]] + ; SINK1: [[COPY1:%[0-9]+]]:gpr32all = COPY [[SDIVWr]] + ; SINK1: [[SUBSWri1:%[0-9]+]]:gpr32 = SUBSWri [[PHI1]], 1, 0, implicit-def $nzcv + ; SINK1: [[COPY2:%[0-9]+]]:gpr32all = COPY [[SUBSWri1]] + ; SINK1: Bcc 0, %bb.2, implicit $nzcv + ; SINK1: B %bb.3 + ; SINK2-LABEL: name: do_sink_use_is_not_a_copy + ; SINK2: bb.0.entry: + ; SINK2: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; SINK2: liveins: $w0 + ; SINK2: [[COPY:%[0-9]+]]:gpr32common = COPY $w0 + ; SINK2: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri [[COPY]], 1, 0, implicit-def $nzcv + ; SINK2: Bcc 11, %bb.2, implicit $nzcv + ; SINK2: B %bb.1 + ; SINK2: bb.1.for.body.preheader: + ; SINK2: successors: %bb.3(0x80000000) + ; SINK2: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) @A + ; SINK2: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui killed [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) @A :: (dereferenceable load 4 from `i32* getelementptr inbounds ([100 x i32], [100 x i32]* @A, i64 0, i64 0)`) + ; SINK2: B %bb.3 + ; SINK2: bb.2.for.cond.cleanup: + ; SINK2: [[PHI:%[0-9]+]]:gpr32all = PHI [[COPY]], %bb.0, %4, %bb.3 + ; SINK2: $w0 = COPY [[PHI]] + ; SINK2: RET_ReallyLR implicit $w0 + ; SINK2: bb.3.for.body: + ; SINK2: successors: %bb.2(0x04000000), %bb.3(0x7c000000) + ; SINK2: [[PHI1:%[0-9]+]]:gpr32sp = PHI [[COPY]], %bb.1, %5, %bb.3 + ; SINK2: [[PHI2:%[0-9]+]]:gpr32 = PHI [[COPY]], %bb.1, %4, %bb.3 + ; SINK2: [[SDIVWr:%[0-9]+]]:gpr32 = SDIVWr [[PHI2]], [[LDRWui]] + ; SINK2: [[COPY1:%[0-9]+]]:gpr32all = COPY [[SDIVWr]] + ; SINK2: [[SUBSWri1:%[0-9]+]]:gpr32 = SUBSWri [[PHI1]], 1, 0, implicit-def $nzcv + ; SINK2: [[COPY2:%[0-9]+]]:gpr32all = COPY [[SUBSWri1]] + ; SINK2: Bcc 0, %bb.2, implicit $nzcv + ; SINK2: B %bb.3 + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $w0 + + %6:gpr32common = COPY $w0 + %7:gpr32 = SUBSWri %6, 1, 0, implicit-def $nzcv + Bcc 11, %bb.2, implicit $nzcv + B %bb.1 + + bb.1.for.body.preheader: + successors: %bb.3(0x80000000) + + %8:gpr64common = ADRP target-flags(aarch64-page) @A + %9:gpr32 = LDRWui killed %8, target-flags(aarch64-pageoff, aarch64-nc) @A :: (dereferenceable load 4 from `i32* getelementptr inbounds ([100 x i32], [100 x i32]* @A, i64 0, i64 0)`) + B %bb.3 + + bb.2.for.cond.cleanup: + %1:gpr32all = PHI %6, %bb.0, %4, %bb.3 + $w0 = COPY %1 + RET_ReallyLR implicit $w0 + + bb.3.for.body: + successors: %bb.2(0x04000000), %bb.3(0x7c000000) + + %2:gpr32sp = PHI %6, %bb.1, %5, %bb.3 + %3:gpr32 = PHI %6, %bb.1, %4, %bb.3 + %10:gpr32 = SDIVWr %3, %9 + %4:gpr32all = COPY %10 + %11:gpr32 = SUBSWri %2, 1, 0, implicit-def $nzcv + %5:gpr32all = COPY %11 + Bcc 0, %bb.2, implicit $nzcv + B %bb.3 + +...