Skip to content

Commit

Permalink
Fix PR23384 (part 3 of 3)
Browse files Browse the repository at this point in the history
Summary:
This patch makes instruction count the highest-priority criterion when
 comparing LSR solutions on X86 (previously register count had the highest priority).

Reviewers: qcolombet

Differential Revision: http://reviews.llvm.org/D30562

From: Evgeny Stupachenko <evstupac@gmail.com>
llvm-svn: 304824
  • Loading branch information
evstupac committed Jun 6, 2017
1 parent 4d4cd8b commit 3b88291
Show file tree
Hide file tree
Showing 16 changed files with 107 additions and 90 deletions.
11 changes: 11 additions & 0 deletions llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Expand Up @@ -2178,6 +2178,17 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}

/// Decide whether LSR cost \p C1 is cheaper than \p C2 on X86.
///
/// The two costs are ordered lexicographically, with the instruction count
/// (Insns) as the most significant key, followed by NumRegs, AddRecCost,
/// NumIVMuls, NumBaseAdds, ScaleCost, ImmCost and SetupCost.
///
/// \returns true only if \p C1 orders strictly before \p C2.
bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                               TargetTransformInfo::LSRCost &C2) {
  // Build the comparison key in one place so both operands are guaranteed to
  // list their fields in the same priority order.
  auto PriorityKey = [](const TargetTransformInfo::LSRCost &Cost) {
    return std::tie(Cost.Insns, Cost.NumRegs, Cost.AddRecCost,
                    Cost.NumIVMuls, Cost.NumBaseAdds, Cost.ScaleCost,
                    Cost.ImmCost, Cost.SetupCost);
  };
  return PriorityKey(C1) < PriorityKey(C2);
}

bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
Type *ScalarTy = DataTy->getScalarType();
int DataWidth = isa<PointerType>(ScalarTy) ?
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/X86/X86TargetTransformInfo.h
Expand Up @@ -101,6 +101,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
bool isLegalMaskedGather(Type *DataType);
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
Expand Up @@ -131,7 +131,7 @@ static cl::opt<bool> EnablePhiElim(

// The flag adds instruction count to the solution cost comparison.
static cl::opt<bool> InsnsCost(
"lsr-insns-cost", cl::Hidden, cl::init(false),
"lsr-insns-cost", cl::Hidden, cl::init(true),
cl::desc("Add instruction count to a LSR cost model"));

// Flag to choose how to narrow complex lsr solution
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=+sse2 -stats 2>&1 | \
; RUN: grep "asm-printer" | grep 35
; RUN: grep "asm-printer" | grep 33

target datalayout = "e-p:32:32"
define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
Expand Down
11 changes: 8 additions & 3 deletions llvm/test/CodeGen/X86/atom-fixup-lea3.ll
@@ -1,6 +1,8 @@
; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
; CHECK: addl ([[reg:%[a-z]+]])
; CHECK-NEXT: addl $4, [[reg]]
; CHECK: addl ({{%[a-z]+}},[[reg:%[a-z]+]],4)
; CHECK-NEXT: movl
; CHECK-NEXT: addl 4({{%[a-z]+}},[[reg:%[a-z]+]],4)
; CHECK-NEXT: incl

; Test for the FixupLEAs pre-emit pass.
; An LEA should NOT be substituted for the ADD instruction
Expand All @@ -20,7 +22,7 @@
; return sum;
;}

define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %m, i32* nocapture %array2) #0 {
define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %k, i32* nocapture %l, i32* nocapture %m, i32* nocapture %array2) #0 {
entry:
%cmp7 = icmp sgt i32 %n, 0
br i1 %cmp7, label %for.body.lr.ph, label %for.end
Expand All @@ -35,6 +37,9 @@ for.body: ; preds = %for.body, %for.body
%j.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc1, %for.body ]
%inc1 = add nsw i32 %j.09, 1
%arrayidx = getelementptr inbounds i32, i32* %array2, i32 %j.09
store i32 %0, i32* %m, align 4
store i32 %sum.010, i32* %m, align 4
store i32 %0, i32* %m, align 4
%1 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, %1
store i32 %add, i32* %m, align 4
Expand Down
10 changes: 2 additions & 8 deletions llvm/test/CodeGen/X86/full-lsr.ll
@@ -1,16 +1,10 @@
; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck %s

define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
; ATOM: foo
; ATOM: addl
; ATOM: addl
; ATOM: leal

; CHECK: foo
; CHECK: addl
; CHECK: addl
; CHECK: addl
; CHECK: incl

entry:
%0 = icmp sgt i32 %N, 0 ; <i1> [#uses=1]
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/X86/hoist-spill.ll
Expand Up @@ -3,10 +3,8 @@
; Check no spills to the same stack slot after hoisting.
; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET1:-?[0-9]*]](%rsp)
; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET2:-?[0-9]*]](%rsp)
; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET3:-?[0-9]*]](%rsp)
; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET1]](%rsp)
; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET2]](%rsp)
; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET3]](%rsp)

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
Expand Down
15 changes: 9 additions & 6 deletions llvm/test/CodeGen/X86/loop-strength-reduce4.ll
Expand Up @@ -4,16 +4,19 @@
; By starting the IV at -64 instead of 0, a cmp is eliminated,
; as the flags from the add can be used directly.

; STATIC: movl $-64, [[ECX:%e..]]
; STATIC: movl $-64, [[EAX:%e..]]

; STATIC: movl [[EAX:%e..]], _state+76([[ECX]])
; STATIC: addl $16, [[ECX]]
; STATIC: movl %{{.+}}, _state+76([[EAX]])
; STATIC: addl $16, [[EAX]]
; STATIC: jne

; In PIC mode the symbol can't be folded, so the change-compare-stride
; trick applies.
; The same for PIC mode.

; PIC: cmpl $64
; PIC: movl $-64, [[EAX:%e..]]

; PIC: movl %{{.+}}, 76(%{{.+}},[[EAX]])
; PIC: addl $16, [[EAX]]
; PIC: jne

@state = external global [0 x i32] ; <[0 x i32]*> [#uses=4]
@S = external global [0 x i32] ; <[0 x i32]*> [#uses=4]
Expand Down
78 changes: 39 additions & 39 deletions llvm/test/CodeGen/X86/madd.ll
Expand Up @@ -9,17 +9,17 @@ define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB0_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi), %xmm2
; SSE2-NEXT: movdqu (%rsi), %xmm3
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2
; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3
; SSE2-NEXT: pmaddwd %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $16, %rsi
; SSE2-NEXT: addq $16, %rdi
; SSE2-NEXT: addq $-8, %rax
; SSE2-NEXT: addq $8, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB0_1
; SSE2-NEXT: # BB#2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
Expand All @@ -34,17 +34,17 @@ define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly
; AVX2: # BB#0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB0_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu (%rsi), %xmm2
; AVX2-NEXT: vpmaddwd (%rdi), %xmm2, %xmm2
; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2
; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: addq $16, %rsi
; AVX2-NEXT: addq $16, %rdi
; AVX2-NEXT: addq $-8, %rax
; AVX2-NEXT: addq $8, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB0_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
Expand All @@ -60,17 +60,17 @@ define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly
; AVX512: # BB#0: # %entry
; AVX512-NEXT: movl %edx, %eax
; AVX512-NEXT: vpxor %ymm0, %ymm0, %ymm0
; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB0_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vmovdqu (%rsi), %xmm2
; AVX512-NEXT: vpmaddwd (%rdi), %xmm2, %xmm2
; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2
; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX512-NEXT: addq $16, %rsi
; AVX512-NEXT: addq $16, %rdi
; AVX512-NEXT: addq $-8, %rax
; AVX512-NEXT: addq $8, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB0_1
; AVX512-NEXT: # BB#2: # %middle.block
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
Expand Down Expand Up @@ -118,12 +118,13 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu (%rdi), %xmm2
; SSE2-NEXT: movdqu (%rsi), %xmm3
; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2
; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pmulhuw %xmm2, %xmm4
; SSE2-NEXT: pmullw %xmm2, %xmm3
Expand All @@ -132,9 +133,8 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $16, %rsi
; SSE2-NEXT: addq $16, %rdi
; SSE2-NEXT: addq $-8, %rax
; SSE2-NEXT: addq $8, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # BB#2: # %middle.block
; SSE2-NEXT: paddd %xmm1, %xmm0
Expand All @@ -149,16 +149,16 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; AVX2: # BB#0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB1_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: addq $16, %rsi
; AVX2-NEXT: addq $16, %rdi
; AVX2-NEXT: addq $-8, %rax
; AVX2-NEXT: addq $8, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB1_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
Expand All @@ -174,16 +174,16 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; AVX512: # BB#0: # %entry
; AVX512-NEXT: movl %edx, %eax
; AVX512-NEXT: vpxor %ymm0, %ymm0, %ymm0
; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB1_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX512-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: addq $16, %rsi
; AVX512-NEXT: addq $16, %rdi
; AVX512-NEXT: addq $-8, %rax
; AVX512-NEXT: addq $8, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB1_1
; AVX512-NEXT: # BB#2: # %middle.block
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
Expand Down Expand Up @@ -231,6 +231,7 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
Expand Down Expand Up @@ -263,9 +264,8 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: addq $16, %rsi
; SSE2-NEXT: addq $16, %rdi
; SSE2-NEXT: addq $-16, %rax
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # BB#2: # %middle.block
; SSE2-NEXT: paddd %xmm3, %xmm0
Expand All @@ -282,17 +282,17 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
; AVX2: # BB#0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovsxbw (%rdi), %ymm2
; AVX2-NEXT: vpmovsxbw (%rsi), %ymm3
; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2
; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $16, %rsi
; AVX2-NEXT: addq $16, %rdi
; AVX2-NEXT: addq $-16, %rax
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
Expand All @@ -309,18 +309,18 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
; AVX512: # BB#0: # %entry
; AVX512-NEXT: movl %edx, %eax
; AVX512-NEXT: vpxord %zmm0, %zmm0, %zmm0
; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB2_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vpmovsxbw (%rdi), %ymm2
; AVX512-NEXT: vpmovsxbw (%rsi), %ymm3
; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2
; AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; AVX512-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2
; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; AVX512-NEXT: addq $16, %rsi
; AVX512-NEXT: addq $16, %rdi
; AVX512-NEXT: addq $-16, %rax
; AVX512-NEXT: addq $16, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB2_1
; AVX512-NEXT: # BB#2: # %middle.block
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
Expand Down

0 comments on commit 3b88291

Please sign in to comment.