Skip to content

Commit

Permalink
[X86] Enable call frame optimization ("mov to push") not only for optsize (PR26325)
Browse files Browse the repository at this point in the history

The size savings are significant, and from what I can tell, both ICC and GCC do this.

Differential Revision: http://reviews.llvm.org/D18573

llvm-svn: 264966
  • Loading branch information
zmodem committed Mar 30, 2016
1 parent 0e450a5 commit 6596977
Show file tree
Hide file tree
Showing 35 changed files with 191 additions and 199 deletions.
4 changes: 0 additions & 4 deletions llvm/lib/Target/X86/X86CallFrameOptimization.cpp
Expand Up @@ -180,10 +180,6 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
if (CannotReserveFrame)
return true;

// Don't do this when not optimizing for size.
if (!MF.getFunction()->optForSize())
return false;

unsigned StackAlign = TFL->getStackAlignment();

int64_t Advantage = 0;
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/2006-05-02-InstrSched1.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
; RUN: llc < %s -march=x86 -relocation-model=static -stats 2>&1 | \
; RUN: grep asm-printer | grep 16
; RUN: grep asm-printer | grep 15
;
; It's possible to schedule this in 14 instructions by avoiding
; callee-save registers, but the scheduler isn't currently that
Expand Down
9 changes: 8 additions & 1 deletion llvm/test/CodeGen/X86/2006-11-12-CSRetCC.ll
Expand Up @@ -6,7 +6,14 @@ target triple = "i686-pc-linux-gnu"
define i32 @main() {
; CHECK-LABEL: main:
; CHECK-NOT: ret
; CHECK: subl $4, %{{.*}}
; CHECK: subl $12, %esp
; CHECK: pushl
; CHECK: pushl
; CHECK: pushl
; CHECK: pushl
; CHECK: pushl
; CHECK: calll cexp
; CHECK: addl $28, %esp
; CHECK: ret

entry:
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/atom-lea-sp.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck -check-prefix=ATOM %s
; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux | FileCheck %s
; RUN: llc < %s -mcpu=atom -mtriple=i686-linux -no-x86-call-frame-opt | FileCheck -check-prefix=ATOM %s
; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux -no-x86-call-frame-opt | FileCheck %s

declare void @use_arr(i8*)
declare void @many_params(i32, i32, i32, i32, i32, i32)
Expand Down
7 changes: 4 additions & 3 deletions llvm/test/CodeGen/X86/avx-intel-ocl.ll
Expand Up @@ -15,9 +15,10 @@ declare i32 @func_int(i32, i32)
; WIN64: ret

; X32-LABEL: testf16_inp
; X32: movl %eax, (%esp)
; X32: vaddps {{.*}}, {{%ymm[0-1]}}
; X32: vaddps {{.*}}, {{%ymm[0-1]}}
; Push is not deemed profitable if we're realigning the stack.
; X32: {{pushl|movl}} %eax
; X32: call
; X32: ret

Expand Down Expand Up @@ -114,8 +115,8 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; test functions with integer parameters
; pass parameters on stack for 32-bit platform
; X32-LABEL: test_int
; X32: movl {{.*}}, 4(%esp)
; X32: movl {{.*}}, (%esp)
; X32: pushl {{.*}}
; X32: pushl {{.*}}
; X32: call
; X32: addl {{.*}}, %eax

Expand Down
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/X86/avx512-intel-ocl.ll
Expand Up @@ -15,7 +15,8 @@ declare i32 @func_int(i32, i32)

; X32-LABEL: testf16_inp
; X32: vaddps {{.*}}, {{%zmm[0-1]}}
; X32: movl %eax, (%esp)
; Push is not deemed profitable if we're realigning the stack.
; X32: {{pushl|movl}} %eax
; X32: call
; X32: ret

Expand Down Expand Up @@ -102,4 +103,4 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a
%mask1 = xor <16 x i1> %cmp_res, %mask
%c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
ret <16 x float> %c
}
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/call-push.ll
@@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=i386-apple-darwin -disable-fp-elim | FileCheck %s
; RUN: llc < %s -mtriple=i386-apple-darwin -disable-fp-elim -no-x86-call-frame-opt | FileCheck %s

%struct.decode_t = type { i8, i8, i8, i8, i16, i8, i8, %struct.range_t** }
%struct.range_t = type { float, float, i32, i32, i32, [0 x i8] }
Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll
Expand Up @@ -21,9 +21,11 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
; i386-NEXT: lahf
; i386-NEXT: movl %eax, [[FLAGS:%.*]]
; i386-NEXT: popl %eax
; i386-NEXT: movl %edx, 4(%esp)
; i386-NEXT: movl %eax, (%esp)
; i386-NEXT: subl $8, %esp
; i386-NEXT: pushl %edx
; i386-NEXT: pushl %eax
; i386-NEXT: calll bar
; i386-NEXT: addl $16, %esp
; i386-NEXT: movl [[FLAGS]], %eax
; i386-NEXT: addb $127, %al
; i386-NEXT: sahf
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/coalescer-commute3.ll
@@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | grep mov | count 6
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 -no-x86-call-frame-opt | grep mov | count 6

%struct.quad_struct = type { i32, i32, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct* }

Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/hipe-prologue.ll
Expand Up @@ -24,7 +24,7 @@ define {i32, i32} @test_basic(i32 %hp, i32 %p) {

define cc 11 {i32, i32} @test_basic_hipecc(i32 %hp, i32 %p) {
; X32-Linux-LABEL: test_basic_hipecc:
; X32-Linux: leal -156(%esp), %ebx
; X32-Linux: leal -140(%esp), %ebx
; X32-Linux-NEXT: cmpl 76(%ebp), %ebx
; X32-Linux-NEXT: jb .LBB1_1

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -1,5 +1,5 @@
; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
; RUN: llc %s -o - -enable-shrink-wrap=true -no-x86-call-frame-opt | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
; RUN: llc %s -o - -enable-shrink-wrap=false -no-x86-call-frame-opt | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386-apple-macosx"

Expand Down
25 changes: 18 additions & 7 deletions llvm/test/CodeGen/X86/libcall-sret.ll
Expand Up @@ -10,14 +10,25 @@ define void @test_sret_libcall(i128 %l, i128 %r) {
; CHECK-LABEL: test_sret_libcall:

; Stack for call: 4(sret ptr), 16(i128 %l), 16(i128 %r). So next logical
; (aligned) place for the actual sret data is %esp + 40.
; CHECK: leal 40(%esp), [[SRET_ADDR:%[a-z]+]]
; CHECK: movl [[SRET_ADDR]], (%esp)
; (aligned) place for the actual sret data is %esp + 20.
; CHECK: leal 20(%esp), [[SRET_ADDR:%[a-z]+]]
; CHECK: pushl 72(%esp)
; CHECK: pushl 72(%esp)
; CHECK: pushl 72(%esp)
; CHECK: pushl 72(%esp)
; CHECK: pushl 72(%esp)
; CHECK: pushl 72(%esp)
; CHECK: pushl 72(%esp)
; CHECK: pushl 72(%esp)
; CHECK: pushl [[SRET_ADDR]]

; CHECK: calll __multi3
; CHECK-DAG: movl 40(%esp), [[RES0:%[a-z]+]]
; CHECK-DAG: movl 44(%esp), [[RES1:%[a-z]+]]
; CHECK-DAG: movl 48(%esp), [[RES2:%[a-z]+]]
; CHECK-DAG: movl 52(%esp), [[RES3:%[a-z]+]]

; CHECK: addl $44, %esp
; CHECK-DAG: movl 8(%esp), [[RES0:%[a-z]+]]
; CHECK-DAG: movl 12(%esp), [[RES1:%[a-z]+]]
; CHECK-DAG: movl 16(%esp), [[RES2:%[a-z]+]]
; CHECK-DAG: movl 20(%esp), [[RES3:%[a-z]+]]
; CHECK-DAG: movl [[RES0]], var
; CHECK-DAG: movl [[RES1]], var+4
; CHECK-DAG: movl [[RES2]], var+8
Expand Down
35 changes: 17 additions & 18 deletions llvm/test/CodeGen/X86/localescape.ll
Expand Up @@ -39,21 +39,19 @@ define void @print_framealloc_from_fp(i8* %fp) {

; X86-LABEL: print_framealloc_from_fp:
; X86: pushl %esi
; X86: subl $8, %esp
; X86: movl 16(%esp), %esi
; X86: movl Lalloc_func$frame_escape_0(%esi), %eax
; X86: movl %eax, 4(%esp)
; X86: movl $_str, (%esp)
; X86: movl 8(%esp), %esi
; X86: pushl Lalloc_func$frame_escape_0(%esi)
; X86: pushl $_str
; X86: calll _printf
; X86: movl Lalloc_func$frame_escape_1(%esi), %eax
; X86: movl %eax, 4(%esp)
; X86: movl $_str, (%esp)
; X86: addl $8, %esp
; X86: pushl Lalloc_func$frame_escape_1(%esi)
; X86: pushl $_str
; X86: calll _printf
; X86: addl $8, %esp
; X86: movl $42, Lalloc_func$frame_escape_1(%esi)
; X86: movl $4, %eax
; X86: movl Lalloc_func$frame_escape_1(%esi,%eax), %eax
; X86: movl %eax, 4(%esp)
; X86: movl $_str, (%esp)
; X86: pushl Lalloc_func$frame_escape_1(%esi,%eax)
; X86: pushl $_str
; X86: calll _printf
; X86: addl $8, %esp
; X86: popl %esi
Expand Down Expand Up @@ -132,12 +130,13 @@ define void @alloc_func_no_frameaddr() {
; X64: retq

; X86-LABEL: alloc_func_no_frameaddr:
; X86: subl $12, %esp
; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 8
; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 4
; X86: movl $42, 8(%esp)
; X86: movl $13, 4(%esp)
; X86: movl $0, (%esp)
; X86: subl $8, %esp
; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 4
; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 0
; X86: movl $42, 4(%esp)
; X86: movl $13, (%esp)
; X86: pushl $0
; X86: calll _print_framealloc_from_fp
; X86: addl $12, %esp
; X86: addl $4, %esp
; X86: addl $8, %esp
; X86: retl
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/X86/mcu-abi.ll
Expand Up @@ -93,14 +93,10 @@ define i32 @test_lib_args(float %a, float %b) #0 {
}

; CHECK-LABEL: test_fp128:
; CHECK: movl (%eax), %e[[CX:..]]
; CHECK-NEXT: movl 4(%eax), %e[[DX:..]]
; CHECK-NEXT: movl 8(%eax), %e[[SI:..]]
; CHECK-NEXT: movl 12(%eax), %e[[AX:..]]
; CHECK-NEXT: movl %e[[AX]], 12(%esp)
; CHECK-NEXT: movl %e[[SI]], 8(%esp)
; CHECK-NEXT: movl %e[[DX]], 4(%esp)
; CHECK-NEXT: movl %e[[CX]], (%esp)
; CHECK: pushl 12(%eax)
; CHECK-NEXT: pushl 8(%eax)
; CHECK-NEXT: pushl 4(%eax)
; CHECK-NEXT: pushl (%eax)
; CHECK-NEXT: calll __fixtfsi
define i32 @test_fp128(fp128* %ptr) #0 {
%v = load fp128, fp128* %ptr
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/memset-2.ll
Expand Up @@ -6,9 +6,9 @@ declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
define fastcc void @t1() nounwind {
; CHECK-LABEL: t1:
; CHECK: subl $12, %esp
; CHECK-NEXT: movl $188, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, (%esp)
; CHECK: pushl $188
; CHECK-NEXT: pushl $0
; CHECK-NEXT: pushl $0
; CHECK-NEXT: calll L_memset$stub
;
entry:
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/mingw-alloca.ll
Expand Up @@ -22,12 +22,12 @@ entry:
; COFF: andl $-16, %esp
; COFF: pushl %eax
; COFF: calll __alloca
; COFF: movl 8028(%esp), %eax
; COFF: movl 8012(%esp), %eax
; ELF: foo2:
; ELF: andl $-16, %esp
; ELF: pushl %eax
; ELF: calll _alloca
; ELF: movl 8028(%esp), %eax
; ELF: movl 8012(%esp), %eax
%A2 = alloca [2000 x i32], align 16 ; <[2000 x i32]*> [#uses=1]
%A2.sub = getelementptr [2000 x i32], [2000 x i32]* %A2, i32 0, i32 0 ; <i32*> [#uses=1]
call void @bar2( i32* %A2.sub, i32 %N )
Expand Down
48 changes: 13 additions & 35 deletions llvm/test/CodeGen/X86/movtopush.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
; RUN: llc < %s -mtriple=i686-windows -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH
; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
; RUN: llc < %s -mtriple=i686-windows -stackrealign -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED

Expand All @@ -12,54 +13,31 @@ declare void @oneparam(i32 %a)
declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h)
declare void @struct(%struct.s* byval %a, i32 %b, i32 %c, i32 %d)

; Here, we should have a reserved frame, so we don't expect pushes
; We should get pushes for x86, even though there is a reserved call frame.
; Make sure we don't touch x86-64, and that turning it off works.
; NORMAL-LABEL: test1:
; NORMAL: subl $16, %esp
; NORMAL-NEXT: movl $4, 12(%esp)
; NORMAL-NEXT: movl $3, 8(%esp)
; NORMAL-NEXT: movl $2, 4(%esp)
; NORMAL-NEXT: movl $1, (%esp)
; NORMAL-NEXT: call
; NORMAL-NEXT: addl $16, %esp
define void @test1() {
entry:
call void @good(i32 1, i32 2, i32 3, i32 4)
ret void
}

; We're optimizing for code size, so we should get pushes for x86,
; even though there is a reserved call frame.
; Make sure we don't touch x86-64
; NORMAL-LABEL: test1b:
; NORMAL-NOT: subl {{.*}} %esp
; NORMAL: pushl $4
; NORMAL-NEXT: pushl $3
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: call
; NORMAL-NEXT: addl $16, %esp
; X64-LABEL: test1b:
; X64-LABEL: test1:
; X64: movl $1, %ecx
; X64-NEXT: movl $2, %edx
; X64-NEXT: movl $3, %r8d
; X64-NEXT: movl $4, %r9d
; X64-NEXT: callq good
define void @test1b() optsize {
entry:
call void @good(i32 1, i32 2, i32 3, i32 4)
ret void
}

; Same as above, but for minsize
; NORMAL-LABEL: test1c:
; NORMAL-NOT: subl {{.*}} %esp
; NORMAL: pushl $4
; NORMAL-NEXT: pushl $3
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: call
; NORMAL-NEXT: addl $16, %esp
define void @test1c() minsize {
; NOPUSH-LABEL: test1:
; NOPUSH: subl $16, %esp
; NOPUSH-NEXT: movl $4, 12(%esp)
; NOPUSH-NEXT: movl $3, 8(%esp)
; NOPUSH-NEXT: movl $2, 4(%esp)
; NOPUSH-NEXT: movl $1, (%esp)
; NOPUSH-NEXT: call
; NOPUSH-NEXT: addl $16, %esp
define void @test1() {
entry:
call void @good(i32 1, i32 2, i32 3, i32 4)
ret void
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 | FileCheck %s
; RUN: llc -O0 < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast | FileCheck %s
; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 | FileCheck -check-prefix=ATOM %s
; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck %s
; RUN: llc -O0 < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -no-x86-call-frame-opt | FileCheck %s
; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck -check-prefix=ATOM %s
; CHECKed instructions should be the same with or without -O0 except on Intel Atom due to instruction scheduling.

@.str = private constant [12 x i8] c"x + y = %i\0A\00", align 1 ; <[12 x i8]*> [#uses=1]
Expand Down

0 comments on commit 6596977

Please sign in to comment.