[X86] Teach X86FloatingPoint's handleCall to only erase the FP stack …

…if there is a regmask operand that clobbers the FP stack. There are some calls to functions like `__alloca` that are missing a regmask operand. Lack of a regmask operand means that all registers that aren't mentioned by def operands are preserved. __alloca only updates EAX and ESP and has def operands for them so this is ok. Because there is no regmask the register allocator won't spill the FP registers across the call. Assuming we want to keep the FP stack untouched across these calls, we need to handle this in the FP stackifier. We might want to add a proper regmask operand to the code that creates these calls to indicate all registers are preserved, but we'd still need this change to the FP stackifier to know to preserve the FP stack for such a regmask. The test is kind of long, but bugpoint wasn't able to reduce it any further. Fixes PR50782 Reviewed By: pengfei Differential Revision: https://reviews.llvm.org/D105762
llvm · Jul 12, 2021 · d5c97f4 · d5c97f4
1 parent 46580d4
commit d5c97f4
Show file tree

Hide file tree

Showing 2 changed files with 145 additions and 0 deletions.
diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -982,8 +982,24 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) {
   MachineInstr &MI = *I;
   unsigned STReturns = 0;
 
+  bool ClobbersFPStack = false;
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     MachineOperand &Op = MI.getOperand(i);
+    // Check if this call clobbers the FP stack. A regmask is expected to
+    // treat FP0-FP7 uniformly (asserted below), so testing FP0 is sufficient.
+    if (Op.isRegMask()) {
+      bool ClobbersFP0 = Op.clobbersPhysReg(X86::FP0);
+#ifndef NDEBUG
+      static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers");
+      for (unsigned i = 1; i != 8; ++i)
+        assert(Op.clobbersPhysReg(X86::FP0 + i) == ClobbersFP0 &&
+               "Inconsistent FP register clobber");
+#endif
+
+      if (ClobbersFP0)
+        ClobbersFPStack = true;
+    }
+
     if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
       continue;
 
@@ -998,6 +1014,14 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) {
     --e;
   }
 
+  // Most calls should have a regmask that clobbers the FP registers. If it
+  // isn't present then the register allocator didn't spill the FP registers
+  // so they are still on the stack.
+  assert((ClobbersFPStack || STReturns == 0) &&
+         "ST returns without FP stack clobber");
+  if (!ClobbersFPStack)
+    return;
+
   unsigned N = countTrailingOnes(STReturns);
 
   // FP registers used for function return must be consecutive starting at

diff --git a/llvm/test/CodeGen/X86/pr50782.ll b/llvm/test/CodeGen/X86/pr50782.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-w64-windows-gnu | FileCheck %s
+
+@a = global i32 0, align 4
+@b = global float 0.000000e+00, align 4
+@d = global float 0.000000e+00, align 4
+@f = global i32 0, align 4
+@g = global float 0.000000e+00, align 4
+@e = global i32 0, align 4
+@c = global float* null, align 4
+
+; The FP stack should be preserved across the call to __alloca.
+define void @h(float %i) {
+; CHECK-LABEL: h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    .cfi_offset %ebp, -8
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    .cfi_def_cfa_register %ebp
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $32, %esp
+; CHECK-NEXT:    movl %esp, %esi
+; CHECK-NEXT:    .cfi_offset %esi, -12
+; CHECK-NEXT:    flds 8(%ebp)
+; CHECK-NEXT:    movl _a, %ecx
+; CHECK-NEXT:    leal 3(%ecx), %eax
+; CHECK-NEXT:    andl $-4, %eax
+; CHECK-NEXT:    calll __alloca
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    andl $-16, %eax
+; CHECK-NEXT:    movl %eax, %esp
+; CHECK-NEXT:    fsts 8(%esi) # 4-byte Folded Spill
+; CHECK-NEXT:    fadds _b
+; CHECK-NEXT:    fsts _d
+; CHECK-NEXT:    fld1
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fld %st(2)
+; CHECK-NEXT:    je LBB0_2
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    movl _f, %ecx
+; CHECK-NEXT:    flds (%eax,%ecx,4)
+; CHECK-NEXT:    fld %st(3)
+; CHECK-NEXT:  LBB0_2: # %for.cond1.preheader
+; CHECK-NEXT:    movl _e, %ecx
+; CHECK-NEXT:    movl %ecx, 12(%esi)
+; CHECK-NEXT:    fildl 12(%esi)
+; CHECK-NEXT:    movl _c, %edx
+; CHECK-NEXT:    jmp LBB0_3
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB0_5: # %for.inc
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    fxch %st(5)
+; CHECK-NEXT:    fadd %st(4), %st
+; CHECK-NEXT:    fxch %st(5)
+; CHECK-NEXT:  LBB0_3: # %for.cond1
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    fld %st(5)
+; CHECK-NEXT:    fmul %st(4), %st
+; CHECK-NEXT:    fdiv %st(2), %st
+; CHECK-NEXT:    fadd %st(3), %st
+; CHECK-NEXT:    fsts _g
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:    fucom %st(1)
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fnstsw %ax
+; CHECK-NEXT:    # kill: def $ah killed $ah killed $ax
+; CHECK-NEXT:    sahf
+; CHECK-NEXT:    jbe LBB0_5
+; CHECK-NEXT:  # %bb.4: # %if.then
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    flds 8(%esi) # 4-byte Folded Reload
+; CHECK-NEXT:    fstps (%edx,%ecx,4)
+; CHECK-NEXT:    jmp LBB0_5
+entry:
+  %0 = load i32, i32* @a, align 4
+  %1 = alloca i8, i32 %0, align 16
+  %2 = load float, float* @b, align 4
+  %add = fadd float %2, %i
+  store float %add, float* @d, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.cond1.preheader, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %3 = bitcast i8* %1 to float*
+  %4 = load i32, i32* @f, align 4
+  %arrayidx.le = getelementptr inbounds float, float* %3, i32 %4
+  %5 = load float, float* %arrayidx.le, align 4
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.body.preheader, %entry
+  %k.0.lcssa = phi float [ %5, %for.body.preheader ], [ undef, %entry ]
+  %l.0.lcssa = phi float [ %add, %for.body.preheader ], [ 1.000000e+00, %entry ]
+  %6 = load i32, i32* @e, align 4
+  %conv = sitofp i32 %6 to float
+  %7 = load float*, float** @c, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %7, i32 %6
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.cond1.preheader
+  %m.0 = phi float [ %add5, %for.inc ], [ %add, %for.cond1.preheader ]
+  %mul = fmul float %m.0, 0.000000e+00
+  %div = fdiv float %mul, %l.0.lcssa
+  %add2 = fadd float %k.0.lcssa, %div
+  store float %add2, float* @g, align 4
+  %cmp = fcmp olt float %add2, %conv
+  br i1 %cmp, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.cond1
+  store float %i, float* %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.cond1
+  %add5 = fadd float %m.0, 1.000000e+00
+  br label %for.cond1
+}