Skip to content

Commit

Permalink
[X86] Vectorcall Calling Convention - Adding CodeGen Complete Support
Browse files Browse the repository at this point in the history
The vectorcall calling convention specifies that arguments to functions are to be passed in registers, when possible.
vectorcall uses more registers for arguments than fastcall or the default x64 calling convention use. 
The vectorcall calling convention is only supported in native code on x86 and x64 processors that include Streaming SIMD Extensions 2 (SSE2) and above.

The current implementation does not handle Homogeneous Vector Aggregates (HVAs) correctly and this review attempts to fix it.
This commit also includes additional lit tests to better cover HVA corner cases.

Differential Revision: https://reviews.llvm.org/D27392

llvm-svn: 290240
  • Loading branch information
Oren Ben Simhon committed Dec 21, 2016
1 parent dcf5b72 commit 3b95157
Show file tree
Hide file tree
Showing 9 changed files with 471 additions and 72 deletions.
48 changes: 48 additions & 0 deletions llvm/include/llvm/CodeGen/CallingConvLower.h
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,12 @@ class CCState {
void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
CCAssignFn Fn);

/// Overload that forwards to AnalyzeFormalArguments, giving formal
/// arguments and call operands a single overloaded entry point so that
/// templated helpers (e.g. AnalyzeArgumentsSecondPass) can handle both.
void AnalyzeArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
                      CCAssignFn Fn) {
  AnalyzeFormalArguments(Ins, Fn);
}

/// AnalyzeReturn - Analyze the returned values of a return,
/// incorporating info about the result values into this state.
void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
Expand All @@ -318,11 +324,22 @@ class CCState {
SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
CCAssignFn Fn);

/// Overload that forwards to AnalyzeCallOperands, giving formal
/// arguments and call operands a single overloaded entry point so that
/// templated helpers (e.g. AnalyzeArgumentsSecondPass) can handle both.
void AnalyzeArguments(const SmallVectorImpl<ISD::OutputArg> &Outs,
                      CCAssignFn Fn) {
  AnalyzeCallOperands(Outs, Fn);
}

/// AnalyzeCallResult - Analyze the return values of a call,
/// incorporating info about the passed values into this state.
void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
CCAssignFn Fn);

/// A shadow allocated register is a register that was allocated
/// but wasn't added to the location list (Locs) — i.e. it was reserved
/// without a value actually being assigned to it.
/// \returns true if the register was allocated as shadow or false otherwise.
bool IsShadowAllocatedReg(unsigned Reg) const;

/// AnalyzeCallResult - Same as above except it's specialized for calls which
/// produce a single value.
void AnalyzeCallResult(MVT VT, CCAssignFn Fn);
Expand Down Expand Up @@ -521,6 +538,37 @@ class CCState {
const SmallVectorImpl<ISD::InputArg> &Ins,
CCAssignFn CalleeFn, CCAssignFn CallerFn);

/// The function runs an additional analysis pass over function arguments.
/// It will mark each argument with the attribute flag SecArgPass and
/// re-analyze it; the locations produced by the second pass are then
/// merged back into \p Locs, sorted by original argument position.
template <class T>
void AnalyzeArgumentsSecondPass(const SmallVectorImpl<T> &Args,
                                CCAssignFn Fn) {
  unsigned NumFirstPassLocs = Locs.size();

  // Create an argument list similar to \p Args in which each argument is
  // marked using the SecArgPass flag.  Note that Arg is deliberately a
  // copy: the original arguments must stay unmodified.
  SmallVector<T, 16> SecPassArg;
  for (auto Arg : Args) {
    Arg.Flags.setSecArgPass();
    SecPassArg.push_back(Arg);
  }

  // Run the second argument pass; its locations are appended to Locs.
  AnalyzeArguments(SecPassArg, Fn);

  // Sort the locations of the arguments according to their original
  // position.  std::merge requires each of the two sub-ranges (first-pass
  // and second-pass locations) to already be ordered by value number.
  SmallVector<CCValAssign, 16> TmpArgLocs;
  std::swap(TmpArgLocs, Locs);
  auto B = TmpArgLocs.begin(), E = TmpArgLocs.end();
  std::merge(B, B + NumFirstPassLocs, B + NumFirstPassLocs, E,
             std::back_inserter(Locs),
             [](const CCValAssign &A, const CCValAssign &B) -> bool {
               return A.getValNo() < B.getValNo();
             });
}

private:
/// MarkAllocated - Mark a register and all of its aliases as allocated.
void MarkAllocated(unsigned Reg);
Expand Down
18 changes: 18 additions & 0 deletions llvm/include/llvm/Target/TargetCallingConv.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,15 @@ namespace ISD {
static const uint64_t SwiftSelfOffs = 14;
static const uint64_t SwiftError = 1ULL<<15; ///< Swift error parameter
static const uint64_t SwiftErrorOffs = 15;
static const uint64_t Hva = 1ULL << 16; ///< HVA field for
///< vectorcall
static const uint64_t HvaOffs = 16;
static const uint64_t HvaStart = 1ULL << 17; ///< HVA structure start
///< for vectorcall
static const uint64_t HvaStartOffs = 17;
static const uint64_t SecArgPass = 1ULL << 18; ///< Second argument
///< pass for vectorcall
static const uint64_t SecArgPassOffs = 18;
static const uint64_t OrigAlign = 0x1FULL<<27;
static const uint64_t OrigAlignOffs = 27;
static const uint64_t ByValSize = 0x3fffffffULL<<32; ///< Struct size
Expand Down Expand Up @@ -91,6 +100,15 @@ namespace ISD {
bool isSwiftError() const { return Flags & SwiftError; }
void setSwiftError() { Flags |= One << SwiftErrorOffs; }

// Hva: this value belongs to a Homogeneous Vector Aggregate (vectorcall).
bool isHva() const { return Flags & Hva; }
void setHva() { Flags |= One << HvaOffs; }

// HvaStart: this value is the first member of an HVA structure.
bool isHvaStart() const { return Flags & HvaStart; }
void setHvaStart() { Flags |= One << HvaStartOffs; }

// SecArgPass: this value is analyzed on the second argument pass
// (used for vectorcall HVA assignment).
bool isSecArgPass() const { return Flags & SecArgPass; }
void setSecArgPass() { Flags |= One << SecArgPassOffs; }

bool isNest() const { return Flags & Nest; }
void setNest() { Flags |= One << NestOffs; }

Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/CodeGen/CallingConvLower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
Expand Down Expand Up @@ -64,6 +66,22 @@ void CCState::MarkAllocated(unsigned Reg) {
UsedRegs[*AI/32] |= 1 << (*AI&31);
}

bool CCState::IsShadowAllocatedReg(unsigned Reg) const {
if (!isAllocated(Reg))
return false;

for (auto const &ValAssign : Locs) {
if (ValAssign.isRegLoc()) {
for (MCRegAliasIterator AI(ValAssign.getLocReg(), &TRI, true);
AI.isValid(); ++AI) {
if (*AI == Reg)
return false;
}
}
}
return true;
}

/// Analyze an array of argument values,
/// incorporating info about the formals into this state.
void
Expand Down
26 changes: 24 additions & 2 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7732,8 +7732,19 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
Flags.setZExt();
if (Args[i].isSExt)
Flags.setSExt();
if (Args[i].isInReg)
if (Args[i].isInReg) {
// If we are using vectorcall calling convention, a structure that is
// passed InReg - is surely an HVA
if (CLI.CallConv == CallingConv::X86_VectorCall &&
isa<StructType>(FinalType)) {
// The first value of a structure is marked
if (0 == Value)
Flags.setHvaStart();
Flags.setHva();
}
// Set InReg Flag
Flags.setInReg();
}
if (Args[i].isSRet)
Flags.setSRet();
if (Args[i].isSwiftSelf)
Expand Down Expand Up @@ -8019,8 +8030,19 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
Flags.setZExt();
if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
Flags.setSExt();
if (F.getAttributes().hasAttribute(Idx, Attribute::InReg))
if (F.getAttributes().hasAttribute(Idx, Attribute::InReg)) {
// If we are using vectorcall calling convention, a structure that is
// passed InReg - is surely an HVA
if (F.getCallingConv() == CallingConv::X86_VectorCall &&
isa<StructType>(I->getType())) {
// The first value of a structure is marked
if (0 == Value)
Flags.setHvaStart();
Flags.setHva();
}
// Set InReg Flag
Flags.setInReg();
}
if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet))
Flags.setSRet();
if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftSelf))
Expand Down
154 changes: 151 additions & 3 deletions llvm/lib/Target/X86/X86CallingConv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//

#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/IR/CallingConv.h"

Expand All @@ -39,14 +40,14 @@ bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
if (AvailableRegs.size() < RequiredGprsUponSplit)
return false; // Not enough free registers - continue the search.

// Allocating the available registers
// Allocating the available registers.
for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {

// Marking the register as located
// Marking the register as located.
unsigned Reg = State.AllocateReg(AvailableRegs[I]);

// Since we previously made sure that 2 registers are available
// we expect that a real register number will be returned
// we expect that a real register number will be returned.
assert(Reg && "Expecting a register will be available");

// Assign the value to the allocated register
Expand All @@ -57,4 +58,151 @@ bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}

/// Returns the six-entry vectorcall SSE register list whose width matches
/// \p ValVT: ZMM0-ZMM5 for 512-bit vectors, YMM0-YMM5 for 256-bit vectors,
/// and XMM0-XMM5 otherwise.
/// MVT is a small value type, so it is passed by value rather than by
/// const reference.
static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(MVT ValVT) {
  if (ValVT.is512BitVector()) {
    static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,
                                           X86::ZMM3, X86::ZMM4, X86::ZMM5};
    return RegListZMM;
  }

  if (ValVT.is256BitVector()) {
    static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,
                                           X86::YMM3, X86::YMM4, X86::YMM5};
    return RegListYMM;
  }

  static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,
                                         X86::XMM3, X86::XMM4, X86::XMM5};
  return RegListXMM;
}

/// Returns the integer (GPR) register list used by the x64 vectorcall
/// calling convention: RCX, RDX, R8, R9.
static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {
  static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};
  return RegListGPR;
}

/// Assigns an XMM/YMM/ZMM register matching \p ValVT to value \p ValNo.
/// A completely free register is preferred; on 64-bit targets a register
/// that was only shadow-allocated on the first pass may also be used.
/// \returns true when a register was assigned (a register must exist for
/// every HVA-marked value; see the llvm_unreachable below).
static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
                                            MVT &LocVT,
                                            CCValAssign::LocInfo &LocInfo,
                                            ISD::ArgFlagsTy &ArgFlags,
                                            CCState &State) {

  ArrayRef<MCPhysReg> RegList = CC_X86_VectorCallGetSSEs(ValVT);
  bool Is64bit = static_cast<const X86Subtarget &>(
                     State.getMachineFunction().getSubtarget())
                     .is64Bit();

  for (auto Reg : RegList) {
    // If the register is not marked as allocated - assign to it.
    if (!State.isAllocated(Reg)) {
      unsigned AssignedReg = State.AllocateReg(Reg);
      assert(AssignedReg == Reg && "Expecting a valid register allocation");
      State.addLoc(
          CCValAssign::getReg(ValNo, ValVT, AssignedReg, LocVT, LocInfo));
      return true;
    }
    // If the register is marked as shadow allocated - assign to it.
    if (Is64bit && State.IsShadowAllocatedReg(Reg)) {
      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
      return true;
    }
  }

  llvm_unreachable("Clang should ensure that hva marked vectors will have "
                   "an available register.");
  return false; // Not reached; kept for compilers that require a return.
}

/// Custom argument assignment for the x64 vectorcall calling convention.
/// On the first pass, floating-point and wide (>= 128-bit) vector values
/// receive XMM/YMM/ZMM registers, while HVA members only reserve shadow
/// registers; other values fall through to the generic rules.  On the
/// second pass (SecArgPass flag set), actual registers are assigned to
/// the HVA members via CC_X86_VectorCallAssignRegister.
/// \returns true when a location was assigned and the search should stop.
bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                          CCValAssign::LocInfo &LocInfo,
                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
  // On the second pass, go through the HVAs only.
  if (ArgFlags.isSecArgPass()) {
    if (ArgFlags.isHva())
      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
                                             ArgFlags, State);
    // Non-HVA values were fully handled on the first pass.
    return true;
  }

  // Process only vector types as defined by vectorcall spec:
  // "A vector type is either a floating-point type, for example,
  // a float or double, or an SIMD vector type, for example, __m128 or __m256".
  if (!(ValVT.isFloatingPoint() ||
        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
    // If R9 was already assigned it means that we are after the fourth element
    // and because this is not an HVA / Vector type, we need to allocate
    // shadow XMM register.
    if (State.isAllocated(X86::R9)) {
      // Assign shadow XMM register.
      (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
    }

    // Continue the search with the generic assignment rules.
    return false;
  }

  if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) {
    // Assign shadow GPR register.
    (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());

    // Assign XMM register - (shadow for HVA and non-shadow for non HVA).
    if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
      // In Vectorcall Calling convention, additional shadow stack can be
      // created on top of the basic 32 bytes of win64.
      // It can happen if the fifth or sixth argument is vector type or HVA.
      // At that case for each argument a shadow stack of 8 bytes is allocated.
      if (Reg == X86::XMM4 || Reg == X86::XMM5)
        State.AllocateStack(8, 8);

      if (!ArgFlags.isHva()) {
        State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
        return true; // Allocated a register - Stop the search.
      }
    }
  }

  // If this is an HVA - Stop the search,
  // otherwise continue the search.
  return ArgFlags.isHva();
}

/// Custom argument assignment for the 32-bit vectorcall calling convention.
/// On the first pass, floating-point and wide (>= 128-bit) vector values
/// receive XMM registers while HVA members are deferred; on the second
/// pass (SecArgPass flag set), actual registers are assigned to the HVA
/// members via CC_X86_VectorCallAssignRegister.
/// \returns true when a location was assigned and the search should stop.
bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                          CCValAssign::LocInfo &LocInfo,
                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
  // On the second pass, go through the HVAs only.
  if (ArgFlags.isSecArgPass()) {
    if (ArgFlags.isHva())
      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
                                             ArgFlags, State);
    // Non-HVA values were fully handled on the first pass.
    return true;
  }

  // Process only vector types as defined by vectorcall spec:
  // "A vector type is either a floating point type, for example,
  // a float or double, or an SIMD vector type, for example, __m128 or __m256".
  if (!(ValVT.isFloatingPoint() ||
        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
    return false;
  }

  if (ArgFlags.isHva())
    return true; // If this is an HVA - Stop the search.

  // Assign XMM register.
  if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
    return true;
  }

  // In case we did not find an available XMM register for a vector -
  // pass it indirectly.
  // It is similar to CCPassIndirect, with the addition of inreg.
  if (!ValVT.isFloatingPoint()) {
    LocVT = MVT::i32;
    LocInfo = CCValAssign::Indirect;
    ArgFlags.setInReg();
  }

  return false; // No register was assigned - Continue the search.
}

} // End llvm namespace
31 changes: 19 additions & 12 deletions llvm/lib/Target/X86/X86CallingConv.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,29 @@ namespace llvm {
/// When regcall calling convention compiled to 32 bit arch, special treatment
/// is required for 64 bit masks.
/// The value should be assigned to two GPRs.
/// @return true if registers were allocated and false otherwise
/// \return true if registers were allocated and false otherwise.
bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State);

/// Rewrites a 32-bit vectorcall argument to be passed indirectly: the
/// location becomes an i32 pointer, marked Indirect and inreg.
/// \returns false so a later rule can place the resulting i32 pointer.
inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
                                         MVT &LocVT,
                                         CCValAssign::LocInfo &LocInfo,
                                         ISD::ArgFlagsTy &ArgFlags,
                                         CCState &State) {
  // Similar to CCPassIndirect, with the addition of inreg.
  LocVT = MVT::i32;
  LocInfo = CCValAssign::Indirect;
  ArgFlags.setInReg();
  return false; // Continue the search, but now for i32.
}
/// Vectorcall calling convention has special handling for vector types or
/// HVA for 64 bit arch.
/// For HVAs shadow registers might be allocated on the first pass
/// and actual XMM registers are allocated on the second pass.
/// For vector types, actual XMM registers are allocated on the first pass.
/// \return true if registers were allocated and false otherwise.
bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State);

/// Vectorcall calling convention has special handling for vector types or
/// HVA for 32 bit arch.
/// For HVAs actual XMM registers are allocated on the second pass.
/// For vector types, actual XMM registers are allocated on the first pass.
/// \return true if registers were allocated and false otherwise.
bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State);

inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
Expand Down
Loading

0 comments on commit 3b95157

Please sign in to comment.