Skip to content

Commit

Permalink
Re-land "[MS] Overhaul how clang passes overaligned args on x86_32"
Browse files Browse the repository at this point in the history
This brings back 2af74e2 and reverts
eaabaf7.

The changes were correct, the code that was broken contained an ODR
violation that assumed that these types are passed equivalently:
  struct alignas(uint64_t) Wrapper { uint64_t P };
  void f(uint64_t p);
  void f(Wrapper p);

MSVC does not pass them the same way, and so clang-cl should not pass
them the same way either.
  • Loading branch information
rnk committed Feb 12, 2020
1 parent 413307d commit 2c6a389
Show file tree
Hide file tree
Showing 6 changed files with 269 additions and 33 deletions.
17 changes: 14 additions & 3 deletions clang/include/clang/CodeGen/CGFunctionInfo.h
Expand Up @@ -88,6 +88,7 @@ class ABIArgInfo {
Kind TheKind;
bool PaddingInReg : 1;
bool InAllocaSRet : 1; // isInAlloca()
bool InAllocaIndirect : 1;// isInAlloca()
bool IndirectByVal : 1; // isIndirect()
bool IndirectRealign : 1; // isIndirect()
bool SRetAfterThis : 1; // isIndirect()
Expand All @@ -110,8 +111,8 @@ class ABIArgInfo {

public:
ABIArgInfo(Kind K = Direct)
: TypeData(nullptr), PaddingType(nullptr), DirectOffset(0),
TheKind(K), PaddingInReg(false), InAllocaSRet(false),
: TypeData(nullptr), PaddingType(nullptr), DirectOffset(0), TheKind(K),
PaddingInReg(false), InAllocaSRet(false), InAllocaIndirect(false),
IndirectByVal(false), IndirectRealign(false), SRetAfterThis(false),
InReg(false), CanBeFlattened(false), SignExt(false) {}

Expand Down Expand Up @@ -185,9 +186,10 @@ class ABIArgInfo {
AI.setInReg(true);
return AI;
}
static ABIArgInfo getInAlloca(unsigned FieldIndex) {
static ABIArgInfo getInAlloca(unsigned FieldIndex, bool Indirect = false) {
auto AI = ABIArgInfo(InAlloca);
AI.setInAllocaFieldIndex(FieldIndex);
AI.setInAllocaIndirect(Indirect);
return AI;
}
static ABIArgInfo getExpand() {
Expand Down Expand Up @@ -380,6 +382,15 @@ class ABIArgInfo {
AllocaFieldIndex = FieldIndex;
}

unsigned getInAllocaIndirect() const {
assert(isInAlloca() && "Invalid kind!");
return InAllocaIndirect;
}
void setInAllocaIndirect(bool Indirect) {
assert(isInAlloca() && "Invalid kind!");
InAllocaIndirect = Indirect;
}

/// Return true if this field of an inalloca struct should be returned
/// to implement a struct return calling convention.
bool getInAllocaSRet() const {
Expand Down
36 changes: 30 additions & 6 deletions clang/lib/CodeGen/CGCall.cpp
Expand Up @@ -2370,6 +2370,9 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
auto FieldIndex = ArgI.getInAllocaFieldIndex();
Address V =
Builder.CreateStructGEP(ArgStruct, FieldIndex, Arg->getName());
if (ArgI.getInAllocaIndirect())
V = Address(Builder.CreateLoad(V),
getContext().getTypeAlignInChars(Ty));
ArgVals.push_back(ParamValue::forIndirect(V));
break;
}
Expand Down Expand Up @@ -4091,18 +4094,39 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
assert(NumIRArgs == 0);
assert(getTarget().getTriple().getArch() == llvm::Triple::x86);
if (I->isAggregate()) {
// Replace the placeholder with the appropriate argument slot GEP.
Address Addr = I->hasLValue()
? I->getKnownLValue().getAddress(*this)
: I->getKnownRValue().getAggregateAddress();
llvm::Instruction *Placeholder =
cast<llvm::Instruction>(Addr.getPointer());
CGBuilderTy::InsertPoint IP = Builder.saveIP();
Builder.SetInsertPoint(Placeholder);
Addr =
Builder.CreateStructGEP(ArgMemory, ArgInfo.getInAllocaFieldIndex());
Builder.restoreIP(IP);

if (!ArgInfo.getInAllocaIndirect()) {
// Replace the placeholder with the appropriate argument slot GEP.
CGBuilderTy::InsertPoint IP = Builder.saveIP();
Builder.SetInsertPoint(Placeholder);
Addr = Builder.CreateStructGEP(ArgMemory,
ArgInfo.getInAllocaFieldIndex());
Builder.restoreIP(IP);
} else {
// For indirect things such as overaligned structs, replace the
// placeholder with a regular aggregate temporary alloca. Store the
// address of this alloca into the struct.
Addr = CreateMemTemp(info_it->type, "inalloca.indirect.tmp");
Address ArgSlot = Builder.CreateStructGEP(
ArgMemory, ArgInfo.getInAllocaFieldIndex());
Builder.CreateStore(Addr.getPointer(), ArgSlot);
}
deferPlaceholderReplacement(Placeholder, Addr.getPointer());
} else if (ArgInfo.getInAllocaIndirect()) {
// Make a temporary alloca and store the address of it into the argument
// struct.
Address Addr = CreateMemTempWithoutCast(
I->Ty, getContext().getTypeAlignInChars(I->Ty),
"indirect-arg-temp");
I->copyInto(*this, Addr);
Address ArgSlot =
Builder.CreateStructGEP(ArgMemory, ArgInfo.getInAllocaFieldIndex());
Builder.CreateStore(Addr.getPointer(), ArgSlot);
} else {
// Store the RValue into the argument struct.
Address Addr =
Expand Down
74 changes: 50 additions & 24 deletions clang/lib/CodeGen/TargetInfo.cpp
Expand Up @@ -1702,6 +1702,7 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
bool IsVectorCall = State.CC == llvm::CallingConv::X86_VectorCall;

Ty = useFirstFieldIfTransparentUnion(Ty);
TypeInfo TI = getContext().getTypeInfo(Ty);

// Check with the C++ ABI first.
const RecordType *RT = Ty->getAs<RecordType>();
Expand Down Expand Up @@ -1751,7 +1752,7 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
bool NeedsPadding = false;
bool InReg;
if (shouldAggregateUseDirect(Ty, State, InReg, NeedsPadding)) {
unsigned SizeInRegs = (getContext().getTypeSize(Ty) + 31) / 32;
unsigned SizeInRegs = (TI.Width + 31) / 32;
SmallVector<llvm::Type*, 3> Elements(SizeInRegs, Int32);
llvm::Type *Result = llvm::StructType::get(LLVMContext, Elements);
if (InReg)
Expand All @@ -1761,29 +1762,44 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
}
llvm::IntegerType *PaddingType = NeedsPadding ? Int32 : nullptr;

// Pass over-aligned aggregates on Windows indirectly. This behavior was
// added in MSVC 2015.
if (IsWin32StructABI && TI.AlignIsRequired && TI.Align > 32)
return getIndirectResult(Ty, /*ByVal=*/false, State);

// Expand small (<= 128-bit) record types when we know that the stack layout
// of those arguments will match the struct. This is important because the
// LLVM backend isn't smart enough to remove byval, which inhibits many
// optimizations.
// Don't do this for the MCU if there are still free integer registers
// (see X86_64 ABI for full explanation).
if (getContext().getTypeSize(Ty) <= 4 * 32 &&
(!IsMCUABI || State.FreeRegs == 0) && canExpandIndirectArgument(Ty))
if (TI.Width <= 4 * 32 && (!IsMCUABI || State.FreeRegs == 0) &&
canExpandIndirectArgument(Ty))
return ABIArgInfo::getExpandWithPadding(
IsFastCall || IsVectorCall || IsRegCall, PaddingType);

return getIndirectResult(Ty, true, State);
}

if (const VectorType *VT = Ty->getAs<VectorType>()) {
// On Windows, vectors are passed directly if registers are available, or
// indirectly if not. This avoids the need to align argument memory. Pass
// user-defined vector types larger than 512 bits indirectly for simplicity.
if (IsWin32StructABI) {
if (TI.Width <= 512 && State.FreeSSERegs > 0) {
--State.FreeSSERegs;
return ABIArgInfo::getDirectInReg();
}
return getIndirectResult(Ty, /*ByVal=*/false, State);
}

// On Darwin, some vectors are passed in memory, we handle this by passing
// it as an i8/i16/i32/i64.
if (IsDarwinVectorABI) {
uint64_t Size = getContext().getTypeSize(Ty);
if ((Size == 8 || Size == 16 || Size == 32) ||
(Size == 64 && VT->getNumElements() == 1))
return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),
Size));
if ((TI.Width == 8 || TI.Width == 16 || TI.Width == 32) ||
(TI.Width == 64 && VT->getNumElements() == 1))
return ABIArgInfo::getDirect(
llvm::IntegerType::get(getVMContext(), TI.Width));
}

if (IsX86_MMXType(CGT.ConvertType(Ty)))
Expand Down Expand Up @@ -1813,16 +1829,22 @@ void X86_32ABIInfo::computeInfo(CGFunctionInfo &FI) const {
CCState State(FI);
if (IsMCUABI)
State.FreeRegs = 3;
else if (State.CC == llvm::CallingConv::X86_FastCall)
else if (State.CC == llvm::CallingConv::X86_FastCall) {
State.FreeRegs = 2;
else if (State.CC == llvm::CallingConv::X86_VectorCall) {
State.FreeSSERegs = 3;
} else if (State.CC == llvm::CallingConv::X86_VectorCall) {
State.FreeRegs = 2;
State.FreeSSERegs = 6;
} else if (FI.getHasRegParm())
State.FreeRegs = FI.getRegParm();
else if (State.CC == llvm::CallingConv::X86_RegCall) {
State.FreeRegs = 5;
State.FreeSSERegs = 8;
} else if (IsWin32StructABI) {
// Since MSVC 2015, the first three SSE vectors have been passed in
// registers. The rest are passed indirectly.
State.FreeRegs = DefaultNumRegisterParameters;
State.FreeSSERegs = 3;
} else
State.FreeRegs = DefaultNumRegisterParameters;

Expand Down Expand Up @@ -1869,16 +1891,25 @@ X86_32ABIInfo::addFieldToArgStruct(SmallVector<llvm::Type *, 6> &FrameFields,
CharUnits &StackOffset, ABIArgInfo &Info,
QualType Type) const {
// Arguments are always 4-byte-aligned.
CharUnits FieldAlign = CharUnits::fromQuantity(4);
CharUnits WordSize = CharUnits::fromQuantity(4);
assert(StackOffset.isMultipleOf(WordSize) && "unaligned inalloca struct");

assert(StackOffset.isMultipleOf(FieldAlign) && "unaligned inalloca struct");
Info = ABIArgInfo::getInAlloca(FrameFields.size());
FrameFields.push_back(CGT.ConvertTypeForMem(Type));
StackOffset += getContext().getTypeSizeInChars(Type);
// sret pointers and indirect things will require an extra pointer
// indirection, unless they are byval. Most things are byval, and will not
// require this indirection.
bool IsIndirect = false;
if (Info.isIndirect() && !Info.getIndirectByVal())
IsIndirect = true;
Info = ABIArgInfo::getInAlloca(FrameFields.size(), IsIndirect);
llvm::Type *LLTy = CGT.ConvertTypeForMem(Type);
if (IsIndirect)
LLTy = LLTy->getPointerTo(0);
FrameFields.push_back(LLTy);
StackOffset += IsIndirect ? WordSize : getContext().getTypeSizeInChars(Type);

// Insert padding bytes to respect alignment.
CharUnits FieldEnd = StackOffset;
StackOffset = FieldEnd.alignTo(FieldAlign);
StackOffset = FieldEnd.alignTo(WordSize);
if (StackOffset != FieldEnd) {
CharUnits NumBytes = StackOffset - FieldEnd;
llvm::Type *Ty = llvm::Type::getInt8Ty(getVMContext());
Expand All @@ -1892,16 +1923,12 @@ static bool isArgInAlloca(const ABIArgInfo &Info) {
switch (Info.getKind()) {
case ABIArgInfo::InAlloca:
return true;
case ABIArgInfo::Indirect:
assert(Info.getIndirectByVal());
return true;
case ABIArgInfo::Ignore:
return false;
case ABIArgInfo::Indirect:
case ABIArgInfo::Direct:
case ABIArgInfo::Extend:
if (Info.getInReg())
return false;
return true;
return !Info.getInReg();
case ABIArgInfo::Expand:
case ABIArgInfo::CoerceAndExpand:
// These are aggregate types which are never passed in registers when
Expand Down Expand Up @@ -1935,8 +1962,7 @@ void X86_32ABIInfo::rewriteWithInAlloca(CGFunctionInfo &FI) const {

// Put the sret parameter into the inalloca struct if it's in memory.
if (Ret.isIndirect() && !Ret.getInReg()) {
CanQualType PtrTy = getContext().getPointerType(FI.getReturnType());
addFieldToArgStruct(FrameFields, StackOffset, Ret, PtrTy);
addFieldToArgStruct(FrameFields, StackOffset, Ret, FI.getReturnType());
// On Windows, the hidden sret parameter is always returned in eax.
Ret.setInAllocaSRet(IsWin32StructABI);
}
Expand Down
44 changes: 44 additions & 0 deletions clang/test/CodeGen/x86_32-arguments-win32.c
Expand Up @@ -46,3 +46,47 @@ struct s6 {
struct s6 f6_1(void) { while (1) {} }
void f6_2(struct s6 a0) {}


// MSVC passes up to three vectors in registers, and the rest indirectly. We
// (arbitrarily) pass oversized vectors indirectly, since that is the safest way
// to do it.
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef float __m256 __attribute__((__vector_size__(32), __aligned__(32)));
typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
typedef float __m1024 __attribute__((__vector_size__(128), __aligned__(128)));

__m128 gv128;
__m256 gv256;
__m512 gv512;
__m1024 gv1024;

void receive_vec_128(__m128 x, __m128 y, __m128 z, __m128 w, __m128 q) {
gv128 = x + y + z + w + q;
}
void receive_vec_256(__m256 x, __m256 y, __m256 z, __m256 w, __m256 q) {
gv256 = x + y + z + w + q;
}
void receive_vec_512(__m512 x, __m512 y, __m512 z, __m512 w, __m512 q) {
gv512 = x + y + z + w + q;
}
void receive_vec_1024(__m1024 x, __m1024 y, __m1024 z, __m1024 w, __m1024 q) {
gv1024 = x + y + z + w + q;
}
// CHECK-LABEL: define dso_local void @receive_vec_128(<4 x float> inreg %x, <4 x float> inreg %y, <4 x float> inreg %z, <4 x float>* %0, <4 x float>* %1)
// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg %x, <8 x float> inreg %y, <8 x float> inreg %z, <8 x float>* %0, <8 x float>* %1)
// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg %x, <16 x float> inreg %y, <16 x float> inreg %z, <16 x float>* %0, <16 x float>* %1)
// CHECK-LABEL: define dso_local void @receive_vec_1024(<32 x float>* %0, <32 x float>* %1, <32 x float>* %2, <32 x float>* %3, <32 x float>* %4)

void pass_vec_128() {
__m128 z = {0};
receive_vec_128(z, z, z, z, z);
}

// CHECK-LABEL: define dso_local void @pass_vec_128()
// CHECK: call void @receive_vec_128(<4 x float> inreg %{{[^,)]*}}, <4 x float> inreg %{{[^,)]*}}, <4 x float> inreg %{{[^,)]*}}, <4 x float>* %{{[^,)]*}}, <4 x float>* %{{[^,)]*}})


void __fastcall fastcall_indirect_vec(__m128 x, __m128 y, __m128 z, __m128 w, int edx, __m128 q) {
gv128 = x + y + z + w + q;
}
// CHECK-LABEL: define dso_local x86_fastcallcc void @"\01@fastcall_indirect_vec@84"(<4 x float> inreg %x, <4 x float> inreg %y, <4 x float> inreg %z, <4 x float>* inreg %0, i32 inreg %edx, <4 x float>* %1)
52 changes: 52 additions & 0 deletions clang/test/CodeGenCXX/inalloca-overaligned.cpp
@@ -0,0 +1,52 @@
// RUN: %clang_cc1 -fms-extensions -w -triple i386-pc-win32 -emit-llvm -o - %s | FileCheck %s

// PR44395
// MSVC passes overaligned types indirectly since MSVC 2015. Make sure that
// works with inalloca.

// FIXME: Pass non-trivial *and* overaligned types indirectly. Right now the C++
// ABI rules say to use inalloca, and they take precedence, so it's not easy to
// implement this.


struct NonTrivial {
NonTrivial();
NonTrivial(const NonTrivial &o);
int x;
};

struct __declspec(align(64)) OverAligned {
OverAligned();
int buf[16];
};

extern int gvi32;

int receive_inalloca_overaligned(NonTrivial nt, OverAligned o) {
return nt.x + o.buf[0];
}

// CHECK-LABEL: define dso_local i32 @"?receive_inalloca_overaligned@@Y{{.*}}"
// CHECK-SAME: (<{ %struct.NonTrivial, %struct.OverAligned* }>* inalloca %0)

int pass_inalloca_overaligned() {
gvi32 = receive_inalloca_overaligned(NonTrivial(), OverAligned());
return gvi32;
}

// CHECK-LABEL: define dso_local i32 @"?pass_inalloca_overaligned@@Y{{.*}}"
// CHECK: [[TMP:%[^ ]*]] = alloca %struct.OverAligned, align 64
// CHECK: call i8* @llvm.stacksave()
// CHECK: alloca inalloca <{ %struct.NonTrivial, %struct.OverAligned* }>

// Construct OverAligned into TMP.
// CHECK: call x86_thiscallcc %struct.OverAligned* @"??0OverAligned@@QAE@XZ"(%struct.OverAligned* [[TMP]])

// Construct NonTrivial into the GEP.
// CHECK: [[GEP:%[^ ]*]] = getelementptr inbounds <{ %struct.NonTrivial, %struct.OverAligned* }>, <{ %struct.NonTrivial, %struct.OverAligned* }>* %{{.*}}, i32 0, i32 0
// CHECK: call x86_thiscallcc %struct.NonTrivial* @"??0NonTrivial@@QAE@XZ"(%struct.NonTrivial* [[GEP]])

// Store the address of an OverAligned temporary into the struct.
// CHECK: getelementptr inbounds <{ %struct.NonTrivial, %struct.OverAligned* }>, <{ %struct.NonTrivial, %struct.OverAligned* }>* %{{.*}}, i32 0, i32 1
// CHECK: store %struct.OverAligned* [[TMP]], %struct.OverAligned** %{{.*}}, align 4
// CHECK: call i32 @"?receive_inalloca_overaligned@@Y{{.*}}"(<{ %struct.NonTrivial, %struct.OverAligned* }>* inalloca %argmem)

0 comments on commit 2c6a389

Please sign in to comment.