ARM: Enable MachineScheduler and disable PostRAScheduler for swift.
This is mostly done to disable the PostRAScheduler, which optimizes for
instruction latencies and is therefore not a good fit for out-of-order
architectures. It also allows us to drop the itinerary table for swift in
favor of the SchedModel descriptions.

This change leads to performance improvements and regressions of as much as
10% in some benchmarks; overall we lose 0.4% performance on the
llvm-testsuite, for reasons that appear to be unknown or outside the
compiler's control. rdar://20803802 documents the investigation of these
effects.

While it is probably a good idea to perform the same switch for the
other ARM out-of-order CPUs, I limited this change to swift as I cannot
perform the benchmark verification on the other CPUs.

Differential Revision: http://reviews.llvm.org/D10513

llvm-svn: 242500
MatzeB committed Jul 17, 2015
1 parent fb2398d commit 2d8315f
Showing 10 changed files with 48 additions and 1,069 deletions.
3 changes: 3 additions & 0 deletions llvm/include/llvm/MC/MCSchedule.h
@@ -206,6 +206,9 @@ struct MCSchedModel {
/// scheduling class (itinerary class or SchedRW list).
bool isComplete() const { return CompleteModel; }

/// Return true if the machine supports out-of-order execution.
bool isOutOfOrder() const { return MicroOpBufferSize > 1; }

unsigned getNumProcResourceKinds() const {
return NumProcResourceKinds;
}
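
The new query is a one-line heuristic over the existing MicroOpBufferSize
field: any model that can buffer more than one micro-op is treated as
out-of-order. Below is a minimal standalone sketch of that heuristic; the
mock struct and the example buffer sizes are illustrative stand-ins, not
values taken from this commit.

// Minimal sketch of the isOutOfOrder() heuristic; SchedModelLike is a
// hypothetical stand-in for llvm::MCSchedModel.
#include <cstdio>

struct SchedModelLike {
  unsigned MicroOpBufferSize; // how many micro-ops the core can buffer
  bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
};

int main() {
  SchedModelLike InOrder{1};     // e.g. a small in-order core
  SchedModelLike OutOfOrder{45}; // e.g. a swift-sized micro-op buffer
  std::printf("in-order:     isOutOfOrder=%d\n", InOrder.isOutOfOrder());
  std::printf("out-of-order: isOutOfOrder=%d\n", OutOfOrder.isOutOfOrder());
  return 0;
}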
1,038 changes: 0 additions & 1,038 deletions llvm/lib/Target/ARM/ARMScheduleSwift.td

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -319,8 +319,19 @@ bool ARMSubtarget::hasSinCos() const {
return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0);
}

bool ARMSubtarget::enableMachineScheduler() const {
// Enable the MachineScheduler before register allocation for out-of-order
// architectures where we do not use the PostRA scheduler anymore (for now
// restricted to swift).
return getSchedModel().isOutOfOrder() && isSwift();
}

// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
bool ARMSubtarget::enablePostRAScheduler() const {
// No need for PostRA scheduling on out of order CPUs (for now restricted to
// swift).
if (getSchedModel().isOutOfOrder() && isSwift())
return false;
return (!isThumb() || hasThumb2());
}
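
Taken together, the two overrides flip the scheduling strategy only for
swift: the pre-RA MachineScheduler is switched on and the PostRA scheduler
off, while every other subtarget keeps its previous behaviour. The following
is a self-contained mock of that decision, not the actual ARMSubtarget code;
in particular the fallback in enablePostRAScheduler is simplified and stands
in for the real Thumb check.

// Illustrative mock of how the two hooks interact; in LLVM the real
// overrides live in ARMSubtarget and are consulted by TargetPassConfig.
#include <cstdio>

struct SubtargetMock {
  bool OutOfOrder; // stand-in for getSchedModel().isOutOfOrder()
  bool Swift;      // stand-in for isSwift()

  bool enableMachineScheduler() const { return OutOfOrder && Swift; }

  bool enablePostRAScheduler() const {
    if (OutOfOrder && Swift)
      return false;
    return true; // simplified stand-in for (!isThumb() || hasThumb2())
  }
};

int main() {
  SubtargetMock Swift{true, true};  // swift: MI scheduler on, PostRA off
  SubtargetMock Other{true, false}; // other OoO ARM CPUs: unchanged for now
  std::printf("swift: misched=%d postra=%d\n",
              Swift.enableMachineScheduler(), Swift.enablePostRAScheduler());
  std::printf("other: misched=%d postra=%d\n",
              Other.enableMachineScheduler(), Other.enablePostRAScheduler());
  return 0;
}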

3 changes: 3 additions & 0 deletions llvm/lib/Target/ARM/ARMSubtarget.h
@@ -433,6 +433,9 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
/// compiler runtime or math libraries.
bool hasSinCos() const;

/// Returns true if machine scheduler should be enabled.
bool enableMachineScheduler() const override;

/// True for some subtargets at > -O0.
bool enablePostRAScheduler() const override;

14 changes: 7 additions & 7 deletions llvm/test/CodeGen/ARM/adv-copy-opt.ll
@@ -11,25 +11,25 @@
; r0 = r0 / r2
; r1 = r1 / r3
;
; NOOPT: vmov [[B:d[0-9]+]], r2, r3
; NOOPT-NEXT: vmov [[A:d[0-9]+]], r0, r1
; NOOPT: vmov [[A:d[0-9]+]], r0, r1
; NOOPT-NEXT: vmov [[B:d[0-9]+]], r2, r3
; Move the low part of B into a register.
; Unfortunately, we cannot express that the 's' register is the low
; part of B, i.e., sIdx == BIdx x 2. E.g., B = d1, B_low = s2.
; NOOPT-NEXT: vmov [[B_LOW:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
; NOOPT-NEXT: vmov [[B_HIGH:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: vmov [[A_HIGH:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
; NOOPT-NEXT: vmov.32 [[RES:d[0-9]+]][0], [[RES_LOW]]
; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
; NOOPT-NEXT: vmov.32 [[RES]][1], [[RES_HIGH]]
; NOOPT-NEXT: vmov r0, r1, [[RES]]
; NOOPT-NEXT: bx lr
;
; OPT-NOT: vmov
; OPT: udiv r0, r0, r2
; OPT-NEXT: udiv r1, r1, r3
; OPT: udiv r1, r1, r3
; OPT-NEXT: udiv r0, r0, r2
; OPT-NEXT: bx lr
define <2 x i32> @simpleVectorDiv(<2 x i32> %A, <2 x i32> %B) nounwind {
entry:
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -1,14 +1,16 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
; Avoid 16-bit 's' instructions that partially update CPSR (and add a false
; dependency) when they aren't dependent on the last CPSR-defining instruction.
; rdar://8928208

define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
entry:
; CHECK-LABEL: t1:
; CHECK: muls [[REG:(r[0-9]+)]], r3, r2
; CHECK-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
; CHECK-CORTEX: muls [[REG:(r[0-9]+)]], r3, r2
; CHECK-CORTEX-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
; CHECK-SWIFT: muls [[REG2:(r[0-9]+)]], r1, r0
; CHECK-SWIFT-NEXT: mul [[REG:(r[0-9]+)]], r2, r3
; CHECK-NEXT: muls r0, [[REG]], [[REG2]]
%0 = mul nsw i32 %a, %b
%1 = mul nsw i32 %c, %d
@@ -21,8 +23,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
define void @t2(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind {
entry:
; CHECK-LABEL: t2:
%tobool7 = icmp eq i32* %ptr2, null
br i1 %tobool7, label %while.end, label %while.body
br label %while.body

while.body:
; CHECK: while.body
@@ -55,8 +56,7 @@ while.end:
define void @t3(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind minsize {
entry:
; CHECK-LABEL: t3:
%tobool7 = icmp eq i32* %ptr2, null
br i1 %tobool7, label %while.end, label %while.body
br label %while.body

while.body:
; CHECK: while.body
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/ARM/cmpxchg-idioms.ll
@@ -15,14 +15,14 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
; CHECK: bne [[LOOP]]

; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: movs r0, #1
; CHECK: dmb ish
; CHECK: movs r0, #1
; CHECK: bx lr

; CHECK: [[FAILED]]:
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: movs r0, #0
; CHECK: dmb ish
; CHECK: movs r0, #0
; CHECK: bx lr

%pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
@@ -34,8 +34,8 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
; CHECK-LABEL: test_return_bool:

; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
; CHECK: dmb ishst
; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1

; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0]
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/ARM/test-sharedidx.ll
@@ -20,8 +20,8 @@ entry:

for.body: ; preds = %entry, %for.body.3
; CHECK: %for.body
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.09
%0 = load i8, i8* %arrayidx, align 1
@@ -42,8 +42,8 @@ for.end: ; preds = %for.body, %for.body

for.body.1: ; preds = %for.body
; CHECK: %for.body.1
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %add5
%2 = load i8, i8* %arrayidx.1, align 1
%conv6.1 = zext i8 %2 to i32
@@ -60,8 +60,8 @@ for.body.1: ; preds = %for.body

for.body.2: ; preds = %for.body.1
; CHECK: %for.body.2
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %add5.1
%4 = load i8, i8* %arrayidx.2, align 1
%conv6.2 = zext i8 %4 to i32
@@ -78,8 +78,8 @@ for.body.2: ; preds = %for.body.1

for.body.3: ; preds = %for.body.2
; CHECK: %for.body.3
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %add5.2
%6 = load i8, i8* %arrayidx.3, align 1
%conv6.3 = zext i8 %6 to i32
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/ARM/vector-load.ll
@@ -238,12 +238,12 @@ define <4 x i32> @zextload_v8i8tov8i32(<4 x i8>** %ptr) {

define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
;CHECK-LABEL: zextload_v8i8tov8i32_fake_update:
;CHECK: ldr.w r[[PTRREG:[0-9]+]], [r0]
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r[[PTRREG]]:32]
;CHECK: add.w r[[INCREG:[0-9]+]], r[[PTRREG]], #16
;CHECK: str.w r[[INCREG]], [r0]
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: str r[[INCREG]], [r0]
%A = load <4 x i8>*, <4 x i8>** %ptr
%lA = load <4 x i8>, <4 x i8>* %A, align 4
%inc = getelementptr <4 x i8>, <4 x i8>* %A, i38 4
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/ARM/vector-store.ll
@@ -228,9 +228,9 @@ define void @truncstore_v4i32tov4i8(<4 x i8>** %ptr, <4 x i32> %val) {
;CHECK: ldr.w r9, [sp]
;CHECK: vmov {{d[0-9]+}}, r3, r9
;CHECK: vmov {{d[0-9]+}}, r1, r2
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32]
%A = load <4 x i8>*, <4 x i8>** %ptr
%trunc = trunc <4 x i32> %val to <4 x i8>
@@ -243,10 +243,10 @@ define void @truncstore_v4i32tov4i8_fake_update(<4 x i8>** %ptr, <4 x i32> %val)
;CHECK: ldr.w r9, [sp]
;CHECK: vmov {{d[0-9]+}}, r3, r9
;CHECK: vmov {{d[0-9]+}}, r1, r2
;CHECK: movs [[IMM16:r[0-9]+]], #16
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: movs [[IMM16:r[0-9]+]], #16
;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32], [[IMM16]]
;CHECK: str r[[PTRREG]], [r0]
%A = load <4 x i8>*, <4 x i8>** %ptr
