Skip to content

Commit

Permalink
CXX_FAST_TLS calling convention: performance improvement for AArch64.
Browse files Browse the repository at this point in the history
The access function has a short entry and a short exit, the initialization
block is only run the first time. To improve the performance, we want to
have a short frame at the entry and exit.

We explicitly handle most of the CSRs via copies. Only the CSRs that are not
handled via copies will be in CSR_SaveList.

Frame lowering and prologue/epilogue insertion will generate a short frame
in the entry and exit according to CSR_SaveList. The majority of the CSRs will
be handled by the register allocator. The register allocator will try to spill and
reload them in the initialization block.

We add CSRsViaCopy, it will be explicitly handled during lowering.

1> we first set FunctionLoweringInfo->SplitCSR if conditions are met (the target
   supports it for the given machine function and the function has only return
   exits). We also call TLI->initializeSplitCSR to perform initialization.
2> we call TLI->insertCopiesSplitCSR to insert copies from CSRsViaCopy to
   virtual registers at beginning of the entry block and copies from virtual
   registers to CSRsViaCopy at beginning of the exit blocks.
3> we also need to make sure the explicit copies will not be eliminated.

The target independent portion was committed as r255353.
rdar://problem/23557469

Differential Revision: http://reviews.llvm.org/D15341

llvm-svn: 255821
  • Loading branch information
manman-ren committed Dec 16, 2015
1 parent 9a5b052 commit cbe4f94
Show file tree
Hide file tree
Showing 8 changed files with 145 additions and 45 deletions.
8 changes: 8 additions & 0 deletions llvm/lib/Target/AArch64/AArch64CallingConvention.td
Expand Up @@ -288,6 +288,14 @@ def CSR_AArch64_CXX_TLS_Darwin
(sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
(sequence "D%u", 0, 31))>;

// Subset of the CXX_FAST_TLS CSRs that is saved and restored by the
// prologue and epilogue: only LR and FP.
def CSR_AArch64_CXX_TLS_Darwin_PE
: CalleeSavedRegs<(add LR, FP)>;

// Subset of the CXX_FAST_TLS CSRs that is handled explicitly via copies:
// everything in CSR_AArch64_CXX_TLS_Darwin except LR and FP.
def CSR_AArch64_CXX_TLS_Darwin_ViaCopy
: CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>;

// The ELF stub used for TLS-descriptor access saves every feasible
// register. Only X0 and LR are clobbered.
def CSR_AArch64_TLS_ELF
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AArch64/AArch64FastISel.cpp
Expand Up @@ -3646,6 +3646,9 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
if (F.isVarArg())
return false;

if (TLI.supportSplitCSR(FuncInfo.MF))
return false;

// Build a list of return value registers.
SmallVector<unsigned, 4> RetRegs;

Expand Down
59 changes: 59 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Expand Up @@ -3271,6 +3271,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (AArch64::GPR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else if (AArch64::FPR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}

RetOps[0] = Chain; // Update chain.

Expand Down Expand Up @@ -10003,3 +10016,49 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons
IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
}

/// Flag the machine function that owns \p Entry as using split CSR handling.
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Record IsSplitCSR in the AArch64FunctionInfo of the enclosing function.
  MachineFunction *MF = Entry->getParent();
  MF->getInfo<AArch64FunctionInfo>()->setIsSplitCSR(true);
}

/// Preserve the callee-saved registers that are handled via copies.
///
/// For every register in the list returned by getCalleeSavedRegsViaCopy, a
/// copy into a fresh virtual register is inserted at the top of \p Entry and
/// a copy back into the physical register is inserted in each block of
/// \p Exits, right before that block's first terminator.
///
/// \param Entry the entry block of the function.
/// \param Exits the exit blocks of the function.
void AArch64TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  // FIXME: this currently does not emit CFI pseudo-instructions, it works
  // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
  // nounwind. If we want to generalize this later, we may need to emit
  // CFI pseudo-instructions.
  // The check does not depend on the register, so do it once up front.
  assert(Entry->getParent()->getFunction()->hasFnAttribute(
             Attribute::NoUnwind) &&
         "Function should be nounwind in insertCopiesSplitCSR!");

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AArch64::GPR64RegClass.contains(*I))
      RC = &AArch64::GPR64RegClass;
    else if (AArch64::FPR64RegClass.contains(*I))
      RC = &AArch64::FPR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    // Create copy from CSR to a virtual register.
    unsigned NewVR = MRI->createVirtualRegister(RC);
    Entry->addLiveIn(*I);
    BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
            NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator. Placing
    // them at the beginning of the block is wrong when the entry block is
    // also an exit block: the copy-back would then precede the copy above
    // that defines NewVR.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
8 changes: 8 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
Expand Up @@ -385,6 +385,14 @@ class AArch64TargetLowering : public TargetLowering {
bool isCheapToSpeculateCtlz() const override {
return true;
}
/// Split CSR handling is supported only for functions that use the
/// CXX_FAST_TLS calling convention and carry the nounwind attribute.
bool supportSplitCSR(MachineFunction *MF) const override {
  const auto *Fn = MF->getFunction();
  if (Fn->getCallingConv() != CallingConv::CXX_FAST_TLS)
    return false;
  return Fn->hasFnAttribute(Attribute::NoUnwind);
}
/// Marks the function containing \p Entry as split-CSR in its
/// AArch64FunctionInfo.
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
/// For each register handled via copies, inserts a copy into a new virtual
/// register in \p Entry and a copy back into the register in every block of
/// \p Exits.
void insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;

private:
bool isExtFreeImpl(const Instruction *Ext) const override;
Expand Down
13 changes: 11 additions & 2 deletions llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
Expand Up @@ -72,16 +72,22 @@ class AArch64FunctionInfo : public MachineFunctionInfo {
/// registers.
unsigned VarArgsFPRSize;

/// True if this function has a subset of CSRs that is handled explicitly via
/// copies.
bool IsSplitCSR;

public:
AArch64FunctionInfo()
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
IsSplitCSR(false) {}

explicit AArch64FunctionInfo(MachineFunction &MF)
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
IsSplitCSR(false) {
(void)MF;
}

Expand All @@ -96,6 +102,9 @@ class AArch64FunctionInfo : public MachineFunctionInfo {
bool hasStackFrame() const { return HasStackFrame; }
void setHasStackFrame(bool s) { HasStackFrame = s; }

/// Returns true if this function has a subset of CSRs that is handled
/// explicitly via copies.
bool isSplitCSR() const { return IsSplitCSR; }
/// Sets the flag reported by isSplitCSR().
void setIsSplitCSR(bool s) { IsSplitCSR = s; }

void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
unsigned getLocalStackSize() const { return LocalStackSize; }

Expand Down
14 changes: 13 additions & 1 deletion llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
Expand Up @@ -15,6 +15,7 @@
#include "AArch64RegisterInfo.h"
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
Expand Down Expand Up @@ -47,11 +48,22 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_SaveList;
if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS)
return CSR_AArch64_CXX_TLS_Darwin_SaveList;
return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
CSR_AArch64_CXX_TLS_Darwin_SaveList;
else
return CSR_AArch64_AAPCS_SaveList;
}

/// Returns the list of callee-saved registers that are preserved through
/// explicit copies for \p MF, or nullptr when \p MF is not a CXX_FAST_TLS
/// function with split CSR handling enabled.
const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
    const MachineFunction *MF) const {
  assert(MF && "Invalid MachineFunction pointer.");
  // Only CXX_FAST_TLS functions can have a ViaCopy list.
  if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS)
    return nullptr;
  // And only once split CSR handling has been turned on for this function.
  if (!MF->getInfo<AArch64FunctionInfo>()->isSplitCSR())
    return nullptr;
  return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList;
}

const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64RegisterInfo.h
Expand Up @@ -35,6 +35,8 @@ struct AArch64RegisterInfo : public AArch64GenRegisterInfo {

/// Code Generation virtual methods...
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
const MCPhysReg *
getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;

Expand Down
83 changes: 41 additions & 42 deletions llvm/test/CodeGen/AArch64/cxx-tlscc.ll
Expand Up @@ -13,7 +13,7 @@ declare %struct.S* @_ZN1SC1Ev(%struct.S* returned)
declare %struct.S* @_ZN1SD1Ev(%struct.S* returned)
declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*)

define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() {
define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
%.b.i = load i1, i1* @__tls_guard, align 1
br i1 %.b.i, label %__tls_init.exit, label %init.i

Expand All @@ -28,50 +28,49 @@ __tls_init.exit:
}

; CHECK-LABEL: _ZTW2sg
; CHECK-DAG: stp d31, d30
; CHECK-DAG: stp d29, d28
; CHECK-DAG: stp d27, d26
; CHECK-DAG: stp d25, d24
; CHECK-DAG: stp d23, d22
; CHECK-DAG: stp d21, d20
; CHECK-DAG: stp d19, d18
; CHECK-DAG: stp d17, d16
; CHECK-DAG: stp d7, d6
; CHECK-DAG: stp d5, d4
; CHECK-DAG: stp d3, d2
; CHECK-DAG: stp d1, d0
; CHECK-DAG: stp x20, x19
; CHECK-DAG: stp x14, x13
; CHECK-DAG: stp x12, x11
; CHECK-DAG: stp x10, x9
; CHECK-DAG: stp x8, x7
; CHECK-DAG: stp x6, x5
; CHECK-DAG: stp x4, x3
; CHECK-DAG: stp x2, x1
; CHECK-DAG: stp x29, x30
; CHECK-NOT: stp d31, d30
; CHECK-NOT: stp d29, d28
; CHECK-NOT: stp d27, d26
; CHECK-NOT: stp d25, d24
; CHECK-NOT: stp d23, d22
; CHECK-NOT: stp d21, d20
; CHECK-NOT: stp d19, d18
; CHECK-NOT: stp d17, d16
; CHECK-NOT: stp d7, d6
; CHECK-NOT: stp d5, d4
; CHECK-NOT: stp d3, d2
; CHECK-NOT: stp d1, d0
; CHECK-NOT: stp x20, x19
; CHECK-NOT: stp x14, x13
; CHECK-NOT: stp x12, x11
; CHECK-NOT: stp x10, x9
; CHECK-NOT: stp x8, x7
; CHECK-NOT: stp x6, x5
; CHECK-NOT: stp x4, x3
; CHECK-NOT: stp x2, x1
; CHECK: blr
; CHECK: tbnz w{{.*}}, #0, [[BB_end:.?LBB0_[0-9]+]]
; CHECK: blr
; CHECK: tlv_atexit
; CHECK: [[BB_end]]:
; CHECK: blr
; CHECK-DAG: ldp x2, x1
; CHECK-DAG: ldp x4, x3
; CHECK-DAG: ldp x6, x5
; CHECK-DAG: ldp x8, x7
; CHECK-DAG: ldp x10, x9
; CHECK-DAG: ldp x12, x11
; CHECK-DAG: ldp x14, x13
; CHECK-DAG: ldp x20, x19
; CHECK-DAG: ldp d1, d0
; CHECK-DAG: ldp d3, d2
; CHECK-DAG: ldp d5, d4
; CHECK-DAG: ldp d7, d6
; CHECK-DAG: ldp d17, d16
; CHECK-DAG: ldp d19, d18
; CHECK-DAG: ldp d21, d20
; CHECK-DAG: ldp d23, d22
; CHECK-DAG: ldp d25, d24
; CHECK-DAG: ldp d27, d26
; CHECK-DAG: ldp d29, d28
; CHECK-DAG: ldp d31, d30
; CHECK-NOT: ldp x2, x1
; CHECK-NOT: ldp x4, x3
; CHECK-NOT: ldp x6, x5
; CHECK-NOT: ldp x8, x7
; CHECK-NOT: ldp x10, x9
; CHECK-NOT: ldp x12, x11
; CHECK-NOT: ldp x14, x13
; CHECK-NOT: ldp x20, x19
; CHECK-NOT: ldp d1, d0
; CHECK-NOT: ldp d3, d2
; CHECK-NOT: ldp d5, d4
; CHECK-NOT: ldp d7, d6
; CHECK-NOT: ldp d17, d16
; CHECK-NOT: ldp d19, d18
; CHECK-NOT: ldp d21, d20
; CHECK-NOT: ldp d23, d22
; CHECK-NOT: ldp d25, d24
; CHECK-NOT: ldp d27, d26
; CHECK-NOT: ldp d29, d28
; CHECK-NOT: ldp d31, d30

0 comments on commit cbe4f94

Please sign in to comment.