Skip to content

Commit

Permalink
[X86] Enable the post-RA-scheduler for clang's default 32-bit cpu.
Browse files Browse the repository at this point in the history
For compilations with no explicit cpu specified, this exhibits
nice gains on Silvermont, with neutral performance on big cores.

Differential Revision: http://reviews.llvm.org/D19138

llvm-svn: 267809
  • Loading branch information
Mitch Bodart committed Apr 27, 2016
1 parent 86ace55 commit e60465d
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 12 deletions.
36 changes: 26 additions & 10 deletions llvm/lib/Target/X86/X86.td
Expand Up @@ -276,12 +276,28 @@ def : Proc<"pentium3", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureSSE1, FeatureFXSR]>;
def : Proc<"pentium3m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureSSE1, FeatureFXSR, FeatureSlowBTMem]>;
def : Proc<"pentium-m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
def : Proc<"pentium4", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureSSE2, FeatureFXSR]>;
def : Proc<"pentium4m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;

// Enable the PostRAScheduler for SSE2- and SSE3-class CPUs.
// The intent is to enable it for pentium4, which is the current default
// processor in a vanilla 32-bit clang compilation when no specific
// architecture is specified. This generally gives a nice performance
// increase on Silvermont, with largely neutral behavior on other
// contemporary large-core processors.
// pentium-m, pentium4m, prescott and nocona are included as a preventative
// measure to avoid performance surprises in case clang's default CPU
// changes slightly.
//
// GenericPostRAModel (defined in X86Schedule.td) is the generic scheduling
// model with PostRAScheduler set to 1.

def : ProcessorModel<"pentium-m", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;

def : ProcessorModel<"pentium4", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureSSE2, FeatureFXSR]>;

def : ProcessorModel<"pentium4m", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;

// Intel Quark.
def : Proc<"lakemont", []>;
Expand All @@ -292,10 +308,10 @@ def : ProcessorModel<"yonah", SandyBridgeModel,
FeatureFXSR, FeatureSlowBTMem]>;

// NetBurst.
def : Proc<"prescott",
[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
FeatureFXSR, FeatureSlowBTMem]>;
def : Proc<"nocona", [
def : ProcessorModel<"prescott", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
FeatureFXSR, FeatureSlowBTMem]>;
def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureSlowUAMem16,
FeatureMMX,
Expand Down
12 changes: 10 additions & 2 deletions llvm/lib/Target/X86/X86Schedule.td
Expand Up @@ -633,8 +633,9 @@ def IIC_NOP : InstrItinClass;
// latencies. Since these latencies are not used for pipeline hazards,
// they do not need to be exact.
//
// The GenericModel contains no instruction itineraries.
def GenericModel : SchedMachineModel {
// The GenericX86Model contains no instruction itineraries
// and disables PostRAScheduler.
class GenericX86Model : SchedMachineModel {
let IssueWidth = 4;
let MicroOpBufferSize = 32;
let LoadLatency = 4;
Expand All @@ -643,6 +644,13 @@ def GenericModel : SchedMachineModel {
let CompleteModel = 0;
}

// The default generic model: GenericX86Model's settings with no overrides.
def GenericModel : GenericX86Model;

// Define a model with the PostRAScheduler enabled. It inherits every other
// setting from GenericX86Model and is selected in X86.td by the pentium-m,
// pentium4, pentium4m, prescott and nocona processor definitions.
def GenericPostRAModel : GenericX86Model {
let PostRAScheduler = 1;
}

include "X86ScheduleAtom.td"
include "X86SchedSandyBridge.td"
include "X86SchedHaswell.td"
Expand Down
40 changes: 40 additions & 0 deletions llvm/test/CodeGen/X86/post-ra-sched.ll
@@ -0,0 +1,40 @@
; RUN: llc < %s -mtriple=i386 -mcpu=pentium4 | FileCheck %s
; RUN: llc < %s -mtriple=i386 -mcpu=pentium4m | FileCheck %s
; RUN: llc < %s -mtriple=i386 -mcpu=pentium-m | FileCheck %s
; RUN: llc < %s -mtriple=i386 -mcpu=prescott | FileCheck %s
; RUN: llc < %s -mtriple=i386 -mcpu=nocona | FileCheck %s
;
; Verify that scheduling puts some distance between a load whose result
; feeds the address of a second load and that second load. This currently
; happens in the post-RA scheduler, which should be enabled by default
; for the CPUs specified above.

@ptrs = external global [0 x i32*], align 4
@idxa = common global i32 0, align 4
@idxb = common global i32 0, align 4
@res = common global i32 0, align 4

; Loads the two indices @idxa and @idxb, uses each one to fetch a pointer
; from @ptrs, dereferences both pointers, and stores the sum of the two
; loaded values to @res.
;
; In the expected assembly below, every load whose address depends on an
; earlier load is separated from that earlier load by one unrelated
; instruction: both index loads come first, then both pointer loads, then
; the two dereferences.
define void @addindirect() {
; CHECK-LABEL: addindirect:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movl idxb, %ecx
; CHECK-NEXT: movl idxa, %eax
; CHECK-NEXT: movl ptrs(,%ecx,4), %ecx
; CHECK-NEXT: movl ptrs(,%eax,4), %eax
; CHECK-NEXT: movl (%ecx), %ecx
; CHECK-NEXT: addl (%eax), %ecx
; CHECK-NEXT: movl %ecx, res
; CHECK-NEXT: retl
entry:
; First dependent chain: idxa -> ptrs[idxa] -> *ptrs[idxa].
%0 = load i32, i32* @idxa, align 4
%arrayidx = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %0
%1 = load i32*, i32** %arrayidx, align 4
%2 = load i32, i32* %1, align 4
; Second dependent chain: idxb -> ptrs[idxb] -> *ptrs[idxb].
%3 = load i32, i32* @idxb, align 4
%arrayidx1 = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %3
%4 = load i32*, i32** %arrayidx1, align 4
%5 = load i32, i32* %4, align 4
; Combine the two chain results and write the sum to @res.
%add = add i32 %5, %2
store i32 %add, i32* @res, align 4
ret void
}

0 comments on commit e60465d

Please sign in to comment.