Skip to content

Commit

Permalink
[WinEH] Allocate space in funclets stack to save XMM CSRs
Browse files Browse the repository at this point in the history
Summary:
This is an alternate approach to D57970.
Currently funclets reuse the same stack slots that are used in the
parent function for saving callee-saved xmm registers. If the parent
function modifies a callee-saved xmm register before an exception is
thrown, the catch handler will overwrite the original saved value.

This patch allocates space in the funclet's stack for saving callee-saved xmm
registers and uses RSP instead of RBP to access memory.

Reviewers: andrew.w.kaylor, LuoYuanke, annita.zhang, craig.topper,
RKSimon

Subscribers: rnk, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D63396

Signed-off-by: pengfei <pengfei.wang@intel.com>
llvm-svn: 367088
  • Loading branch information
phoebewang committed Jul 26, 2019
1 parent 7f8c809 commit 9ad565f
Show file tree
Hide file tree
Showing 7 changed files with 244 additions and 57 deletions.
133 changes: 110 additions & 23 deletions llvm/lib/Target/X86/X86FrameLowering.cpp
Expand Up @@ -935,7 +935,10 @@ bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
; calls @llvm.eh.unwind.init
[if needs FP]
[for all callee-saved XMM registers]
movaps %<xmm reg>, -MMM(%rbp)
[if funclet]
movaps %<xmm reg>, -MMM(%rsp)
[else]
movaps %<xmm reg>, -MMM(%rbp)
[for all callee-saved XMM registers]
.seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
; i.e. the offset relative to (%rbp - SEHFrameOffset)
Expand All @@ -955,7 +958,10 @@ bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
; Emit CFI info
[if needs FP]
[for all callee-saved registers]
.cfi_offset %<reg>, (offset from %rbp)
[if funclet]
movaps -MMM(%rsp), %<xmm reg>
[else]
.cfi_offset %<reg>, (offset from %rbp)
[else]
.cfi_def_cfa_offset (offset from RETADDR)
[for all callee-saved registers]
Expand Down Expand Up @@ -1177,11 +1183,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
MFI.setOffsetAdjustment(-StackSize);
}

// For EH funclets, only allocate enough space for outgoing calls. Save the
// NumBytes value that we would've used for the parent frame.
// For EH funclets, only allocate enough space for outgoing calls and callee
// saved XMM registers on Windows 64 bits. Save the NumBytes value that we
// would've used for the parent frame.
int XMMFrameSlotOrigin;
unsigned ParentFrameNumBytes = NumBytes;
if (IsFunclet)
if (IsFunclet) {
NumBytes = getWinEHFuncletFrameSize(MF);
if (IsWin64Prologue)
NumBytes += X86FI->getCalleeSavedXMMFrameInfo(XMMFrameSlotOrigin);
}

// Skip the callee-saved push instructions.
bool PushedRegs = false;
Expand Down Expand Up @@ -1389,19 +1400,33 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
}

while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
const MachineInstr &FrameInstr = *MBBI;
auto FrameInstr = MBBI;
++MBBI;

if (NeedsWinCFI) {
int FI;
if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
if (unsigned Reg = TII.isStoreToStackSlot(*FrameInstr, FI)) {
if (X86::FR64RegClass.contains(Reg)) {
unsigned IgnoredFrameReg;
int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
Offset += SEHFrameOffset;

int Offset = 0;
HasWinCFI = true;
assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
if (IsFunclet) {
assert(IsWin64Prologue && "Only valid on Windows 64bit");
unsigned Size = TRI->getSpillSize(X86::VR128RegClass);
unsigned Align = TRI->getSpillAlignment(X86::VR128RegClass);
Offset = (FI - XMMFrameSlotOrigin - 1) * Size +
alignDown(NumBytes, Align);
addRegOffset(BuildMI(MBB, MBBI, DL,
TII.get(getXMMAlignedLoadStoreOp(false))),
StackPtr, true, Offset)
.addReg(Reg)
.setMIFlag(MachineInstr::FrameSetup);
MBB.erase(FrameInstr);
} else {
assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
unsigned IgnoredFrameReg;
Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg) +
SEHFrameOffset;
}
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
.addImm(Reg)
.addImm(Offset)
Expand Down Expand Up @@ -1621,6 +1646,9 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (IsFunclet) {
assert(HasFP && "EH funclets without FP not yet implemented");
NumBytes = getWinEHFuncletFrameSize(MF);
int Ignore;
if (IsWin64Prologue)
NumBytes += X86FI->getCalleeSavedXMMFrameInfo(Ignore);
} else if (HasFP) {
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
Expand Down Expand Up @@ -1948,6 +1976,8 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();

unsigned CalleeSavedFrameSize = 0;
unsigned CalleeSavedXMMFrameSize = 0;
int CalleeSavedXMMSlotOrigin = 0;
int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();

int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
Expand Down Expand Up @@ -2011,9 +2041,44 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);

// Assign slots for XMMs.
for (unsigned i = CSI.size(), Size = 0; i != 0; --i) {
unsigned Reg = CSI[i - 1].getReg();
// According to Microsoft's "x64 software conventions", the XMM registers are
// the only nonvolatile registers apart from the GPRs.
if (!X86::VR128RegClass.contains(Reg))
continue;
// Since all registers have the same size, we just initialize once.
if (Size == 0) {
unsigned Align = TRI->getSpillAlignment(X86::VR128RegClass);
// ensure alignment
int Remainder = SpillSlotOffset % Align;
if (Remainder < 0)
SpillSlotOffset -= Align + Remainder;
else
SpillSlotOffset -= Remainder;
MFI.ensureMaxAlignment(Align);
Size = TRI->getSpillSize(X86::VR128RegClass);
}
// spill into slot
SpillSlotOffset -= Size;
int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
CSI[i - 1].setFrameIdx(SlotIndex);
// Since we allocate XMM slot consecutively in stack, we just need to
// record the first one for the funclet use.
if (CalleeSavedXMMFrameSize == 0) {
CalleeSavedXMMSlotOrigin = SlotIndex;
}
CalleeSavedXMMFrameSize += Size;
}

X86FI->setCalleeSavedXMMFrameInfo(CalleeSavedXMMFrameSize,
CalleeSavedXMMSlotOrigin);

// Assign slots for others.
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i - 1].getReg();
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg) ||
X86::VR128RegClass.contains(Reg))
continue;

// If this is k-register make sure we lookup via the largest legal type.
Expand All @@ -2025,7 +2090,11 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
unsigned Size = TRI->getSpillSize(*RC);
unsigned Align = TRI->getSpillAlignment(*RC);
// ensure alignment
SpillSlotOffset -= std::abs(SpillSlotOffset) % Align;
int Remainder = SpillSlotOffset % Align;
if (Remainder < 0)
SpillSlotOffset -= Align + Remainder;
else
SpillSlotOffset -= Remainder;
// spill into slot
SpillSlotOffset -= Size;
int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
Expand Down Expand Up @@ -2164,19 +2233,32 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
DebugLoc DL = MBB.findDebugLoc(MI);

// Reload XMMs from stack frame.
MachineFunction &MF = *MBB.getParent();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
int XMMFrameSlotOrigin;
int SEHFrameOffset = X86FI->getCalleeSavedXMMFrameInfo(XMMFrameSlotOrigin) +
MF.getFrameInfo().getMaxCallFrameSize();
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
if (X86::GR64RegClass.contains(Reg) ||
X86::GR32RegClass.contains(Reg))
continue;
if (MBB.isEHFuncletEntry() && STI.is64Bit()) {
if (X86::VR128RegClass.contains(Reg)) {
int Offset = (CSI[i].getFrameIdx() - XMMFrameSlotOrigin - 1) * 16;
addRegOffset(BuildMI(MBB, MI, DL,
TII.get(getXMMAlignedLoadStoreOp(true)), Reg),
X86::RSP, true, SEHFrameOffset + Offset);
}
} else {
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
continue;

// If this is k-register make sure we lookup via the largest legal type.
MVT VT = MVT::Other;
if (X86::VK16RegClass.contains(Reg))
VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
// If this is k-register make sure we lookup via the largest legal type.
MVT VT = MVT::Other;
if (X86::VK16RegClass.contains(Reg))
VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;

const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
}
}

// POP GPRs.
Expand Down Expand Up @@ -3185,3 +3267,8 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
UnwindHelpFI)
.addImm(-2);
}

/// Return the MOVAPS-family opcode used to save or restore an XMM register
/// for the current subtarget.
///
/// \param IsLoad  true selects the register-from-memory (`rm`) opcode, false
///                selects the memory-from-register (`mr`) opcode.
/// \returns the V-prefixed opcode when the subtarget reports AVX, otherwise
///          the plain MOVAPS opcode.
unsigned X86FrameLowering::getXMMAlignedLoadStoreOp(const bool IsLoad) const {
  // Query the subtarget once; both directions key off the same feature bit.
  const bool HasAVX = STI.hasAVX();
  if (IsLoad)
    return HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
  return HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
}
4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86FrameLowering.h
Expand Up @@ -217,6 +217,10 @@ class X86FrameLowering : public TargetFrameLowering {
void emitCatchRetReturnValue(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineInstr *CatchRet) const;

/// Select the best aligned XMM load/store opcode for the subtarget, used when
/// saving and restoring callee-saved XMM registers in funclets.
unsigned getXMMAlignedLoadStoreOp(const bool IsLoad) const;
};

} // End llvm namespace
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/X86/X86MachineFunctionInfo.h
Expand Up @@ -40,6 +40,14 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// stack frame in bytes.
unsigned CalleeSavedFrameSize = 0;

/// CalleeSavedXMMFrameSize - Size of the callee-saved XMM register portion
/// of the stack frame in bytes.
unsigned CalleeSavedXMMFrameSize = 0;

/// CalleeSavedXMMFrameOrigin - Origin slot of the callee-saved XMM register
/// portion of the stack frame.
int CalleeSavedXMMFrameOrigin = 0;

/// BytesToPopOnReturn - Number of bytes function pops on return (in addition
/// to the space used by the return address).
/// Used on windows platform for stdcall & fastcall name decoration
Expand Down Expand Up @@ -123,6 +131,11 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }

/// Report the callee-saved XMM frame information.
///
/// \param origin  [out] receives the stored CalleeSavedXMMFrameOrigin value.
/// \returns the stored CalleeSavedXMMFrameSize value.
unsigned getCalleeSavedXMMFrameInfo(int &origin) const {
  origin = CalleeSavedXMMFrameOrigin;
  return CalleeSavedXMMFrameSize;
}
/// Record the callee-saved XMM frame information.
///
/// \param size    value stored into CalleeSavedXMMFrameSize.
/// \param origin  value stored into CalleeSavedXMMFrameOrigin.
void setCalleeSavedXMMFrameInfo(unsigned size, int origin) {
  CalleeSavedXMMFrameOrigin = origin;
  CalleeSavedXMMFrameSize = size;
}

unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}

Expand Down
56 changes: 28 additions & 28 deletions llvm/test/CodeGen/X86/avx512-intel-ocl.ll
Expand Up @@ -184,14 +184,14 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; WIN64-KNL-LABEL: test_prolog_epilog:
; WIN64-KNL: # %bb.0:
; WIN64-KNL-NEXT: pushq %rbp
; WIN64-KNL-NEXT: subq $1328, %rsp # imm = 0x530
; WIN64-KNL-NEXT: subq $1264, %rsp # imm = 0x4F0
; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-KNL-NEXT: kmovw %k7, 1198(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k6, 1196(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k5, 1194(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k4, 1192(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm21, 1104(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm20, 992(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: kmovw %k7, 1134(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k6, 1132(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k5, 1130(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k4, 1128(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm21, 1024(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm20, 960(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm17, 768(%rbp) # 64-byte Spill
Expand Down Expand Up @@ -226,26 +226,26 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; WIN64-KNL-NEXT: vmovaps 768(%rbp), %zmm17 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 992(%rbp), %zmm20 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 1104(%rbp), %zmm21 # 64-byte Reload
; WIN64-KNL-NEXT: kmovw 1192(%rbp), %k4 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1194(%rbp), %k5 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1196(%rbp), %k6 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1198(%rbp), %k7 # 2-byte Reload
; WIN64-KNL-NEXT: leaq 1200(%rbp), %rsp
; WIN64-KNL-NEXT: vmovaps 960(%rbp), %zmm20 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 1024(%rbp), %zmm21 # 64-byte Reload
; WIN64-KNL-NEXT: kmovw 1128(%rbp), %k4 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1130(%rbp), %k5 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1132(%rbp), %k6 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1134(%rbp), %k7 # 2-byte Reload
; WIN64-KNL-NEXT: leaq 1136(%rbp), %rsp
; WIN64-KNL-NEXT: popq %rbp
; WIN64-KNL-NEXT: retq
;
; WIN64-SKX-LABEL: test_prolog_epilog:
; WIN64-SKX: # %bb.0:
; WIN64-SKX-NEXT: pushq %rbp
; WIN64-SKX-NEXT: subq $1328, %rsp # imm = 0x530
; WIN64-SKX-NEXT: subq $1264, %rsp # imm = 0x4F0
; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-SKX-NEXT: kmovq %k7, 1192(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k6, 1184(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k5, 1176(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k4, 1168(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm21, 1056(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: kmovq %k7, 1128(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k6, 1120(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k5, 1112(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k4, 1104(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm21, 1024(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm20, 960(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill
Expand Down Expand Up @@ -282,12 +282,12 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; WIN64-SKX-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 960(%rbp), %zmm20 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 1056(%rbp), %zmm21 # 64-byte Reload
; WIN64-SKX-NEXT: kmovq 1168(%rbp), %k4 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1176(%rbp), %k5 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1184(%rbp), %k6 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1192(%rbp), %k7 # 8-byte Reload
; WIN64-SKX-NEXT: leaq 1200(%rbp), %rsp
; WIN64-SKX-NEXT: vmovaps 1024(%rbp), %zmm21 # 64-byte Reload
; WIN64-SKX-NEXT: kmovq 1104(%rbp), %k4 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1112(%rbp), %k5 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1120(%rbp), %k6 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1128(%rbp), %k7 # 8-byte Reload
; WIN64-SKX-NEXT: leaq 1136(%rbp), %rsp
; WIN64-SKX-NEXT: popq %rbp
; WIN64-SKX-NEXT: retq
;
Expand Down Expand Up @@ -346,7 +346,7 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; X64-SKX: ## %bb.0:
; X64-SKX-NEXT: pushq %rsi
; X64-SKX-NEXT: pushq %rdi
; X64-SKX-NEXT: subq $1192, %rsp ## imm = 0x4A8
; X64-SKX-NEXT: subq $1064, %rsp ## imm = 0x428
; X64-SKX-NEXT: kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT: kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT: kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill
Expand Down Expand Up @@ -388,7 +388,7 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload
; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte Reload
; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload
; X64-SKX-NEXT: addq $1192, %rsp ## imm = 0x4A8
; X64-SKX-NEXT: addq $1064, %rsp ## imm = 0x428
; X64-SKX-NEXT: popq %rdi
; X64-SKX-NEXT: popq %rsi
; X64-SKX-NEXT: retq
Expand Down
15 changes: 15 additions & 0 deletions llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll
Expand Up @@ -51,3 +51,18 @@ catch:
; CHECK: popq %rbp
; CHECK: retq
; CHECK: .seh_handlerdata
; CHECK: # %catch
; CHECK: movq %rdx, 16(%rsp)
; CHECK: pushq %rbp
; CHECK: .seh_pushreg 5
; CHECK: subq $48, %rsp
; CHECK: .seh_stackalloc 48
; CHECK: leaq 64(%rdx), %rbp
; CHECK: movapd %xmm6, 32(%rsp)
; CHECK: .seh_savexmm 6, 32
; CHECK: .seh_endprologue
; CHECK: movapd 32(%rsp), %xmm6
; CHECK: leaq .LBB0_1(%rip), %rax
; CHECK: addq $48, %rsp
; CHECK: popq %rbp
; CHECK: retq # CATCHRET

0 comments on commit 9ad565f

Please sign in to comment.