[X86] Enable call frame optimization ("mov to push") not only for optsize (PR26325)

The size savings are significant, and from what I can tell, both ICC and GCC do this.

Differential Revision: http://reviews.llvm.org/D18573

llvm-svn: 264966
llvm · Mar 30, 2016 · 6596977 · 6596977
1 parent 0e450a5
commit 6596977
Show file tree

Hide file tree

Showing 35 changed files with 191 additions and 199 deletions.
diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -180,10 +180,6 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
   if (CannotReserveFrame)
     return true;
 
-  // Don't do this when not optimizing for size.
-  if (!MF.getFunction()->optForSize())
-    return false;
-
   unsigned StackAlign = TFL->getStackAlignment();
 
   int64_t Advantage = 0;

diff --git a/llvm/test/CodeGen/X86/2006-05-02-InstrSched1.ll b/llvm/test/CodeGen/X86/2006-05-02-InstrSched1.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -march=x86 -relocation-model=static -stats 2>&1 | \
-; RUN:   grep asm-printer | grep 16
+; RUN:   grep asm-printer | grep 15
 ;
 ; It's possible to schedule this in 14 instructions by avoiding
 ; callee-save registers, but the scheduler isn't currently that

diff --git a/llvm/test/CodeGen/X86/2006-11-12-CSRetCC.ll b/llvm/test/CodeGen/X86/2006-11-12-CSRetCC.ll
@@ -6,7 +6,14 @@ target triple = "i686-pc-linux-gnu"
 define i32 @main() {
 ; CHECK-LABEL: main:
 ; CHECK-NOT: ret
-; CHECK: subl $4, %{{.*}}
+; CHECK: subl $12, %esp
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: calll cexp
+; CHECK: addl $28, %esp
 ; CHECK: ret
 
 entry:

diff --git a/llvm/test/CodeGen/X86/atom-lea-sp.ll b/llvm/test/CodeGen/X86/atom-lea-sp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mcpu=atom -mtriple=i686-linux  | FileCheck -check-prefix=ATOM %s
-; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux | FileCheck %s
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux  -no-x86-call-frame-opt | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux -no-x86-call-frame-opt | FileCheck %s
 
 declare void @use_arr(i8*)
 declare void @many_params(i32, i32, i32, i32, i32, i32)

diff --git a/llvm/test/CodeGen/X86/avx-intel-ocl.ll b/llvm/test/CodeGen/X86/avx-intel-ocl.ll
@@ -15,9 +15,10 @@ declare i32 @func_int(i32, i32)
 ; WIN64: ret
 
 ; X32-LABEL: testf16_inp
-; X32: movl    %eax, (%esp)
 ; X32: vaddps  {{.*}}, {{%ymm[0-1]}}
 ; X32: vaddps  {{.*}}, {{%ymm[0-1]}}
+; Push is not deemed profitable if we're realigning the stack.
+; X32: {{pushl|movl}}   %eax
 ; X32: call
 ; X32: ret
 
@@ -114,8 +115,8 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; test functions with integer parameters
 ; pass parameters on stack for 32-bit platform
 ; X32-LABEL: test_int
-; X32: movl {{.*}}, 4(%esp)
-; X32: movl {{.*}}, (%esp)
+; X32: pushl {{.*}}
+; X32: pushl {{.*}}
 ; X32: call
 ; X32: addl {{.*}}, %eax
 

diff --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -15,7 +15,8 @@ declare i32 @func_int(i32, i32)
 
 ; X32-LABEL: testf16_inp
 ; X32: vaddps  {{.*}}, {{%zmm[0-1]}}
-; X32: movl    %eax, (%esp)
+; Push is not deemed profitable if we're realigning the stack.
+; X32: {{pushl|movl}}   %eax
 ; X32: call
 ; X32: ret
 
@@ -102,4 +103,4 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a
    %mask1 = xor <16 x i1> %cmp_res, %mask
    %c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
    ret <16 x float> %c
-}
+}
diff --git a/llvm/test/CodeGen/X86/call-push.ll b/llvm/test/CodeGen/X86/call-push.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -disable-fp-elim -no-x86-call-frame-opt | FileCheck %s
 
         %struct.decode_t = type { i8, i8, i8, i8, i16, i8, i8, %struct.range_t** }
         %struct.range_t = type { float, float, i32, i32, i32, [0 x i8] }

diff --git a/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll
@@ -21,9 +21,11 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
 ; i386-NEXT: lahf
 ; i386-NEXT: movl %eax, [[FLAGS:%.*]]
 ; i386-NEXT: popl %eax
-; i386-NEXT: movl %edx, 4(%esp)
-; i386-NEXT: movl %eax, (%esp)
+; i386-NEXT: subl $8, %esp
+; i386-NEXT: pushl %edx
+; i386-NEXT: pushl %eax
 ; i386-NEXT: calll bar
+; i386-NEXT: addl $16, %esp
 ; i386-NEXT: movl [[FLAGS]], %eax
 ; i386-NEXT: addb $127, %al
 ; i386-NEXT: sahf

diff --git a/llvm/test/CodeGen/X86/coalescer-commute3.ll b/llvm/test/CodeGen/X86/coalescer-commute3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | grep mov | count 6
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 -no-x86-call-frame-opt | grep mov | count 6
 
 	%struct.quad_struct = type { i32, i32, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct* }
 

diff --git a/llvm/test/CodeGen/X86/hipe-prologue.ll b/llvm/test/CodeGen/X86/hipe-prologue.ll
@@ -24,7 +24,7 @@ define {i32, i32} @test_basic(i32 %hp, i32 %p) {
 
 define cc 11 {i32, i32} @test_basic_hipecc(i32 %hp, i32 %p) {
   ; X32-Linux-LABEL:       test_basic_hipecc:
-  ; X32-Linux:       leal -156(%esp), %ebx
+  ; X32-Linux:       leal -140(%esp), %ebx
   ; X32-Linux-NEXT:  cmpl 76(%ebp), %ebx
   ; X32-Linux-NEXT:  jb .LBB1_1
 

diff --git a/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll b/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -1,5 +1,5 @@
-; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
-; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+; RUN: llc %s -o - -enable-shrink-wrap=true -no-x86-call-frame-opt | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: llc %s -o - -enable-shrink-wrap=false -no-x86-call-frame-opt | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 target triple = "i386-apple-macosx"
 

diff --git a/llvm/test/CodeGen/X86/libcall-sret.ll b/llvm/test/CodeGen/X86/libcall-sret.ll
@@ -10,14 +10,25 @@ define void @test_sret_libcall(i128 %l, i128 %r) {
 ; CHECK-LABEL: test_sret_libcall:
 
   ; Stack for call: 4(sret ptr), 16(i128 %l), 16(128 %r). So next logical
-  ; (aligned) place for the actual sret data is %esp + 40.
-; CHECK: leal 40(%esp), [[SRET_ADDR:%[a-z]+]]
-; CHECK: movl [[SRET_ADDR]], (%esp)
+  ; (aligned) place for the actual sret data is %esp + 20.
+; CHECK: leal 20(%esp), [[SRET_ADDR:%[a-z]+]]
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl [[SRET_ADDR]]
+
 ; CHECK: calll __multi3
-; CHECK-DAG: movl 40(%esp), [[RES0:%[a-z]+]]
-; CHECK-DAG: movl 44(%esp), [[RES1:%[a-z]+]]
-; CHECK-DAG: movl 48(%esp), [[RES2:%[a-z]+]]
-; CHECK-DAG: movl 52(%esp), [[RES3:%[a-z]+]]
+
+; CHECK: addl $44, %esp
+; CHECK-DAG: movl 8(%esp), [[RES0:%[a-z]+]]
+; CHECK-DAG: movl 12(%esp), [[RES1:%[a-z]+]]
+; CHECK-DAG: movl 16(%esp), [[RES2:%[a-z]+]]
+; CHECK-DAG: movl 20(%esp), [[RES3:%[a-z]+]]
 ; CHECK-DAG: movl [[RES0]], var
 ; CHECK-DAG: movl [[RES1]], var+4
 ; CHECK-DAG: movl [[RES2]], var+8

diff --git a/llvm/test/CodeGen/X86/localescape.ll b/llvm/test/CodeGen/X86/localescape.ll
@@ -39,21 +39,19 @@ define void @print_framealloc_from_fp(i8* %fp) {
 
 ; X86-LABEL: print_framealloc_from_fp:
 ; X86: pushl   %esi
-; X86: subl    $8, %esp
-; X86: movl    16(%esp), %esi
-; X86: movl    Lalloc_func$frame_escape_0(%esi), %eax
-; X86: movl    %eax, 4(%esp)
-; X86: movl    $_str, (%esp)
+; X86: movl    8(%esp), %esi
+; X86: pushl   Lalloc_func$frame_escape_0(%esi)
+; X86: pushl   $_str
 ; X86: calll   _printf
-; X86: movl    Lalloc_func$frame_escape_1(%esi), %eax
-; X86: movl    %eax, 4(%esp)
-; X86: movl    $_str, (%esp)
+; X86: addl    $8, %esp
+; X86: pushl   Lalloc_func$frame_escape_1(%esi)
+; X86: pushl   $_str
 ; X86: calll   _printf
+; X86: addl    $8, %esp
 ; X86: movl    $42, Lalloc_func$frame_escape_1(%esi)
 ; X86: movl    $4, %eax
-; X86: movl    Lalloc_func$frame_escape_1(%esi,%eax), %eax
-; X86: movl    %eax, 4(%esp)
-; X86: movl    $_str, (%esp)
+; X86: pushl   Lalloc_func$frame_escape_1(%esi,%eax)
+; X86: pushl   $_str
 ; X86: calll   _printf
 ; X86: addl    $8, %esp
 ; X86: popl    %esi
@@ -132,12 +130,13 @@ define void @alloc_func_no_frameaddr() {
 ; X64: retq
 
 ; X86-LABEL: alloc_func_no_frameaddr:
-; X86: subl    $12, %esp
-; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 8
-; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 4
-; X86: movl $42, 8(%esp)
-; X86: movl $13, 4(%esp)
-; X86: movl $0, (%esp)
+; X86: subl    $8, %esp
+; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 4
+; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 0
+; X86: movl $42, 4(%esp)
+; X86: movl $13, (%esp)
+; X86: pushl $0
 ; X86: calll _print_framealloc_from_fp
-; X86: addl $12, %esp
+; X86: addl    $4, %esp
+; X86: addl    $8, %esp
 ; X86: retl
diff --git a/llvm/test/CodeGen/X86/mcu-abi.ll b/llvm/test/CodeGen/X86/mcu-abi.ll
@@ -93,14 +93,10 @@ define i32 @test_lib_args(float %a, float %b) #0 {
 }
 
 ; CHECK-LABEL: test_fp128:
-; CHECK: movl    (%eax), %e[[CX:..]]
-; CHECK-NEXT: movl    4(%eax), %e[[DX:..]]
-; CHECK-NEXT: movl    8(%eax), %e[[SI:..]]
-; CHECK-NEXT: movl    12(%eax), %e[[AX:..]]
-; CHECK-NEXT: movl    %e[[AX]], 12(%esp)
-; CHECK-NEXT: movl    %e[[SI]], 8(%esp)
-; CHECK-NEXT: movl    %e[[DX]], 4(%esp)
-; CHECK-NEXT: movl    %e[[CX]], (%esp)
+; CHECK:      pushl   12(%eax)
+; CHECK-NEXT: pushl   8(%eax)
+; CHECK-NEXT: pushl   4(%eax)
+; CHECK-NEXT: pushl   (%eax)
 ; CHECK-NEXT: calll   __fixtfsi
 define i32 @test_fp128(fp128* %ptr) #0 {
   %v = load fp128, fp128* %ptr

diff --git a/llvm/test/CodeGen/X86/memset-2.ll b/llvm/test/CodeGen/X86/memset-2.ll
@@ -6,9 +6,9 @@ declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
 define fastcc void @t1() nounwind {
 ; CHECK-LABEL: t1:
 ; CHECK:         subl $12, %esp
-; CHECK-NEXT:    movl $188, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $0, (%esp)
+; CHECK:         pushl $188
+; CHECK-NEXT:    pushl $0
+; CHECK-NEXT:    pushl $0
 ; CHECK-NEXT:    calll L_memset$stub
 ;
 entry:

diff --git a/llvm/test/CodeGen/X86/mingw-alloca.ll b/llvm/test/CodeGen/X86/mingw-alloca.ll
@@ -22,12 +22,12 @@ entry:
 ; COFF: andl $-16, %esp
 ; COFF: pushl %eax
 ; COFF: calll __alloca
-; COFF: movl	8028(%esp), %eax
+; COFF: movl	8012(%esp), %eax
 ; ELF: foo2:
 ; ELF: andl $-16, %esp
 ; ELF: pushl %eax
 ; ELF: calll _alloca
-; ELF: movl	8028(%esp), %eax
+; ELF: movl	8012(%esp), %eax
 	%A2 = alloca [2000 x i32], align 16		; <[2000 x i32]*> [#uses=1]
 	%A2.sub = getelementptr [2000 x i32], [2000 x i32]* %A2, i32 0, i32 0		; <i32*> [#uses=1]
 	call void @bar2( i32* %A2.sub, i32 %N )

diff --git a/llvm/test/CodeGen/X86/movtopush.ll b/llvm/test/CodeGen/X86/movtopush.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=i686-windows -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH
 ; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
 ; RUN: llc < %s -mtriple=i686-windows -stackrealign -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED
 
@@ -12,54 +13,31 @@ declare void @oneparam(i32 %a)
 declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h)
 declare void @struct(%struct.s* byval %a, i32 %b, i32 %c, i32 %d)
 
-; Here, we should have a reserved frame, so we don't expect pushes
+; We should get pushes for x86, even though there is a reserved call frame.
+; Make sure we don't touch x86-64, and that turning it off works.
 ; NORMAL-LABEL: test1:
-; NORMAL: subl    $16, %esp
-; NORMAL-NEXT: movl    $4, 12(%esp)
-; NORMAL-NEXT: movl    $3, 8(%esp)
-; NORMAL-NEXT: movl    $2, 4(%esp)
-; NORMAL-NEXT: movl    $1, (%esp)
-; NORMAL-NEXT: call
-; NORMAL-NEXT: addl $16, %esp
-define void @test1() {
-entry:
-  call void @good(i32 1, i32 2, i32 3, i32 4)
-  ret void
-}
-
-; We're optimizing for code size, so we should get pushes for x86,
-; even though there is a reserved call frame.
-; Make sure we don't touch x86-64
-; NORMAL-LABEL: test1b:
 ; NORMAL-NOT: subl {{.*}} %esp
 ; NORMAL: pushl   $4
 ; NORMAL-NEXT: pushl   $3
 ; NORMAL-NEXT: pushl   $2
 ; NORMAL-NEXT: pushl   $1
 ; NORMAL-NEXT: call
 ; NORMAL-NEXT: addl $16, %esp
-; X64-LABEL: test1b:
+; X64-LABEL: test1:
 ; X64: movl    $1, %ecx
 ; X64-NEXT: movl    $2, %edx
 ; X64-NEXT: movl    $3, %r8d
 ; X64-NEXT: movl    $4, %r9d
 ; X64-NEXT: callq   good
-define void @test1b() optsize {
-entry:
-  call void @good(i32 1, i32 2, i32 3, i32 4)
-  ret void
-}
-
-; Same as above, but for minsize
-; NORMAL-LABEL: test1c:
-; NORMAL-NOT: subl {{.*}} %esp
-; NORMAL: pushl   $4
-; NORMAL-NEXT: pushl   $3
-; NORMAL-NEXT: pushl   $2
-; NORMAL-NEXT: pushl   $1
-; NORMAL-NEXT: call
-; NORMAL-NEXT: addl $16, %esp
-define void @test1c() minsize {
+; NOPUSH-LABEL: test1:
+; NOPUSH: subl    $16, %esp
+; NOPUSH-NEXT: movl    $4, 12(%esp)
+; NOPUSH-NEXT: movl    $3, 8(%esp)
+; NOPUSH-NEXT: movl    $2, 4(%esp)
+; NOPUSH-NEXT: movl    $1, (%esp)
+; NOPUSH-NEXT: call
+; NOPUSH-NEXT: addl $16, %esp
+define void @test1() {
 entry:
   call void @good(i32 1, i32 2, i32 3, i32 4)
   ret void

diff --git a/llvm/test/CodeGen/X86/phys-reg-local-regalloc.ll b/llvm/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 | FileCheck %s
-; RUN: llc -O0 < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast | FileCheck %s
-; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck %s
+; RUN: llc -O0 < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -no-x86-call-frame-opt | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck -check-prefix=ATOM %s
 ; CHECKed instructions should be the same with or without -O0 except on Intel Atom due to instruction scheduling.
 
 @.str = private constant [12 x i8] c"x + y = %i\0A\00", align 1 ; <[12 x i8]*> [#uses=1]