Skip to content

Commit

Permalink
[X86] Support arch=x86-64{,-v2,-v3,-v4} for target_clones attribute
Browse files Browse the repository at this point in the history
GCC 12 (https://gcc.gnu.org/PR101696) allows `arch=x86-64`,
`arch=x86-64-v2`, `arch=x86-64-v3`, and `arch=x86-64-v4` in the
target_clones function attribute. This patch ports the feature to Clang.

* Set KeyFeature to `x86-64{,-v2,-v3,-v4}` in `Processors[]`, to be used
  by X86TargetInfo::multiVersionSortPriority
* builtins: change `__cpu_features2` to an array like libgcc. Define
  `FEATURE_X86_64_{BASELINE,V2,V3,V4}` and the ISA feature bits they depend on.
* CGBuiltin.cpp: update EmitX86CpuSupports to handle `arch=x86-64*`.

Close #55830

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D158329
  • Loading branch information
MaskRay committed Aug 24, 2023
1 parent 2289c7f commit 7a41af8
Show file tree
Hide file tree
Showing 8 changed files with 138 additions and 34 deletions.
38 changes: 21 additions & 17 deletions clang/lib/CodeGen/CGBuiltin.cpp
Expand Up @@ -54,6 +54,7 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/MatrixBuilder.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/TargetParser/X86TargetParser.h"
Expand Down Expand Up @@ -13324,16 +13325,15 @@ Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
}

Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
return EmitX86CpuSupports(llvm::X86::getCpuSupportsMask(FeatureStrs));
uint64_t Mask = llvm::X86::getCpuSupportsMask(FeatureStrs);
std::array<uint32_t, 4> FeatureMask{Lo_32(Mask), Hi_32(Mask), 0, 0};
return EmitX86CpuSupports(FeatureMask);
}

llvm::Value *CodeGenFunction::EmitX86CpuSupports(uint64_t FeaturesMask) {
uint32_t Features1 = Lo_32(FeaturesMask);
uint32_t Features2 = Hi_32(FeaturesMask);

llvm::Value *
CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
Value *Result = Builder.getTrue();

if (Features1 != 0) {
if (FeatureMask[0] != 0) {
// Matching the struct layout from the compiler-rt/libgcc structure that is
// filled in:
// unsigned int __cpu_vendor;
Expand All @@ -13356,22 +13356,26 @@ llvm::Value *CodeGenFunction::EmitX86CpuSupports(uint64_t FeaturesMask) {
CharUnits::fromQuantity(4));

// Check the value of the bit corresponding to the feature requested.
Value *Mask = Builder.getInt32(Features1);
Value *Mask = Builder.getInt32(FeatureMask[0]);
Value *Bitset = Builder.CreateAnd(Features, Mask);
Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
Result = Builder.CreateAnd(Result, Cmp);
}

if (Features2 != 0) {
llvm::Constant *CpuFeatures2 = CGM.CreateRuntimeVariable(Int32Ty,
"__cpu_features2");
cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);

Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures2,
CharUnits::fromQuantity(4));

llvm::Type *ATy = llvm::ArrayType::get(Int32Ty, 3);
llvm::Constant *CpuFeatures2 =
CGM.CreateRuntimeVariable(ATy, "__cpu_features2");
cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);
for (int i = 1; i != 4; ++i) {
const uint32_t M = FeatureMask[i];
if (!M)
continue;
Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)};
Value *Features = Builder.CreateAlignedLoad(
Int32Ty, Builder.CreateGEP(ATy, CpuFeatures2, Idxs),
CharUnits::fromQuantity(4));
// Check the value of the bit corresponding to the feature requested.
Value *Mask = Builder.getInt32(Features2);
Value *Mask = Builder.getInt32(M);
Value *Bitset = Builder.CreateAnd(Features, Mask);
Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
Result = Builder.CreateAnd(Result, Cmp);
Expand Down
23 changes: 21 additions & 2 deletions clang/lib/CodeGen/CodeGenFunction.cpp
Expand Up @@ -2681,8 +2681,27 @@ llvm::Value *CodeGenFunction::FormX86ResolverCondition(
const MultiVersionResolverOption &RO) {
llvm::Value *Condition = nullptr;

if (!RO.Conditions.Architecture.empty())
Condition = EmitX86CpuIs(RO.Conditions.Architecture);
if (!RO.Conditions.Architecture.empty()) {
StringRef Arch = RO.Conditions.Architecture;
std::array<uint32_t, 4> Mask{};
// If arch= specifies an x86-64 micro-architecture level, test a special
// feature named FEATURE_X86_64_*, otherwise we use __builtin_cpu_is.
if (Arch.consume_front("x86-64")) {
if (Arch.empty()) // FEATURE_X86_64_BASELINE 95==2*32+31
Mask[2] = 1u << 31;
else if (Arch == "-v2") // FEATURE_X86_64_V2 96==3*32+0
Mask[3] = 1u << 0;
else if (Arch == "-v3") // FEATURE_X86_64_V3 97==3*32+1
Mask[3] = 1u << 1;
else if (Arch == "-v4") // FEATURE_X86_64_V4 98==3*32+2
Mask[3] = 1u << 2;
else
llvm_unreachable("invalid x86-64 micro-architecture level");
Condition = EmitX86CpuSupports(Mask);
} else {
Condition = EmitX86CpuIs(Arch);
}
}

if (!RO.Conditions.Features.empty()) {
llvm::Value *FeatureCond = EmitX86CpuSupports(RO.Conditions.Features);
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/CodeGen/CodeGenFunction.h
Expand Up @@ -4902,7 +4902,7 @@ class CodeGenFunction : public CodeGenTypeCache {
llvm::Value *EmitX86CpuIs(StringRef CPUStr);
llvm::Value *EmitX86CpuSupports(const CallExpr *E);
llvm::Value *EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs);
llvm::Value *EmitX86CpuSupports(uint64_t Mask);
llvm::Value *EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask);
llvm::Value *EmitX86CpuInit();
llvm::Value *FormX86ResolverCondition(const MultiVersionResolverOption &RO);
llvm::Value *EmitAArch64CpuInit();
Expand Down
25 changes: 25 additions & 0 deletions clang/test/CodeGen/attr-target-clones.c
Expand Up @@ -13,6 +13,9 @@
// WINDOWS: $foo_inline = comdat any
// WINDOWS: $foo_inline2 = comdat any

// LINUX: @__cpu_model = external dso_local global { i32, i32, i32, [1 x i32] }
// LINUX: @__cpu_features2 = external dso_local global [3 x i32]

// LINUX: @foo.ifunc = weak_odr ifunc i32 (), ptr @foo.resolver
// LINUX: @foo_dupes.ifunc = weak_odr ifunc void (), ptr @foo_dupes.resolver
// LINUX: @unused.ifunc = weak_odr ifunc void (), ptr @unused.resolver
Expand Down Expand Up @@ -137,6 +140,28 @@ int test_foo_used_no_defn(void) {
// WINDOWS: musttail call i32 @foo_used_no_defn.sse4.2.0
// WINDOWS: musttail call i32 @foo_used_no_defn.default.1

__attribute__((target_clones("default", "arch=x86-64", "arch=x86-64-v2", "arch=x86-64-v3", "arch=x86-64-v4")))
int isa_level(int) { return 0; }
// LINUX: define{{.*}} i32 @isa_level.default.4(
// LINUX: define{{.*}} i32 @isa_level.arch_x86-64.0(
// LINUX: define{{.*}} i32 @isa_level.arch_x86-64-v2.1(
// LINUX: define{{.*}} i32 @isa_level.arch_x86-64-v3.2(
// LINUX: define{{.*}} i32 @isa_level.arch_x86-64-v4.3(
// LINUX: define weak_odr ptr @isa_level.resolver() comdat
// LINUX: call void @__cpu_indicator_init()
// LINUX-NEXT: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
// LINUX-NEXT: and i32 %[[#]], 4
// LINUX: ret ptr @isa_level.arch_x86-64-v4.3
// LINUX: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
// LINUX-NEXT: and i32 %[[#]], 2
// LINUX: ret ptr @isa_level.arch_x86-64-v3.2
// LINUX: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
// LINUX-NEXT: and i32 %[[#]], 1
// LINUX: ret ptr @isa_level.arch_x86-64-v2.1
// LINUX: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 1)
// LINUX-NEXT: and i32 %[[#]], -2147483648
// LINUX: ret ptr @isa_level.arch_x86-64.0
// LINUX: ret ptr @isa_level.default.4

// Deferred emission of inline definitions.

Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGen/builtin-cpu-supports.c
Expand Up @@ -5,7 +5,7 @@
extern void a(const char *);

// CHECK: @__cpu_model = external dso_local global { i32, i32, i32, [1 x i32] }
// CHECK: @__cpu_features2 = external dso_local global i32
// CHECK: @__cpu_features2 = external dso_local global [3 x i32]

int main(void) {
__builtin_cpu_init();
Expand Down
4 changes: 4 additions & 0 deletions clang/test/Sema/attr-target-clones.c
Expand Up @@ -118,3 +118,7 @@ void __attribute__((__overloadable__)) good_overload4(int) __attribute__((target
// expected-error@+1 {{attribute 'target_clones' multiversioning cannot be combined with attribute 'overloadable'}}
void __attribute__((__overloadable__)) good_overload5(void) __attribute__((target_clones("mmx", "sse4.2", "default")));
void good_overload5(int) __attribute__((target_clones("mmx", "sse4.2", "default")));

void good_isa_level(int) __attribute__((target_clones("default", "arch=x86-64", "arch=x86-64-v2", "arch=x86-64-v3", "arch=x86-64-v4")));
// expected-warning@+1 {{unsupported CPU 'x86-64-v5' in the 'target_clones' attribute string; 'target_clones' attribute ignored}}
void bad_isa_level(int) __attribute__((target_clones("default", "arch=x86-64-v5")));
68 changes: 59 additions & 9 deletions compiler-rt/lib/builtins/cpu_model.c
Expand Up @@ -158,6 +158,19 @@ enum ProcessorFeatures {
FEATURE_AVX512BITALG,
FEATURE_AVX512BF16,
FEATURE_AVX512VP2INTERSECT,

FEATURE_CMPXCHG16B = 46,
FEATURE_F16C = 49,
FEATURE_LAHF_LM = 54,
FEATURE_LM,
FEATURE_WP,
FEATURE_LZCNT,
FEATURE_MOVBE,

FEATURE_X86_64_BASELINE = 95,
FEATURE_X86_64_V2,
FEATURE_X86_64_V3,
FEATURE_X86_64_V4,
CPU_FEATURE_MAX
};

Expand Down Expand Up @@ -677,6 +690,7 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
unsigned *Features) {
unsigned EAX = 0, EBX = 0;

#define hasFeature(F) ((Features[F / 32] >> (F % 32)) & 1)
#define setFeature(F) \
Features[F / 32] |= 1U << (F % 32)

Expand All @@ -697,14 +711,20 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
setFeature(FEATURE_SSSE3);
if ((ECX >> 12) & 1)
setFeature(FEATURE_FMA);
if ((ECX >> 13) & 1)
setFeature(FEATURE_CMPXCHG16B);
if ((ECX >> 19) & 1)
setFeature(FEATURE_SSE4_1);
if ((ECX >> 20) & 1)
setFeature(FEATURE_SSE4_2);
if ((ECX >> 22) & 1)
setFeature(FEATURE_MOVBE);
if ((ECX >> 23) & 1)
setFeature(FEATURE_POPCNT);
if ((ECX >> 25) & 1)
setFeature(FEATURE_AES);
if ((ECX >> 29) & 1)
setFeature(FEATURE_F16C);

// If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
// indicates that the AVX registers will be saved and restored on context
Expand Down Expand Up @@ -786,12 +806,39 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,

bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 &&
!getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
if (HasExtLeaf1 && ((ECX >> 6) & 1))
setFeature(FEATURE_SSE4_A);
if (HasExtLeaf1 && ((ECX >> 11) & 1))
setFeature(FEATURE_XOP);
if (HasExtLeaf1 && ((ECX >> 16) & 1))
setFeature(FEATURE_FMA4);
if (HasExtLeaf1) {
if (ECX & 1)
setFeature(FEATURE_LAHF_LM);
if ((ECX >> 5) & 1)
setFeature(FEATURE_LZCNT);
if (((ECX >> 6) & 1))
setFeature(FEATURE_SSE4_A);
if (((ECX >> 11) & 1))
setFeature(FEATURE_XOP);
if (((ECX >> 16) & 1))
setFeature(FEATURE_FMA4);
if (((EDX >> 29) & 1))
setFeature(FEATURE_LM);
}

if (hasFeature(FEATURE_LM) && hasFeature(FEATURE_SSE2)) {
setFeature(FEATURE_X86_64_BASELINE);
if (hasFeature(FEATURE_CMPXCHG16B) && hasFeature(FEATURE_POPCNT) &&
hasFeature(FEATURE_LAHF_LM) && hasFeature(FEATURE_SSE4_2)) {
setFeature(FEATURE_X86_64_V2);
if (hasFeature(FEATURE_AVX2) && hasFeature(FEATURE_BMI) &&
hasFeature(FEATURE_BMI2) && hasFeature(FEATURE_F16C) &&
hasFeature(FEATURE_FMA) && hasFeature(FEATURE_LZCNT) &&
hasFeature(FEATURE_MOVBE)) {
setFeature(FEATURE_X86_64_V3);
if (hasFeature(FEATURE_AVX512BW) && hasFeature(FEATURE_AVX512CD) &&
hasFeature(FEATURE_AVX512DQ) && hasFeature(FEATURE_AVX512VL))
setFeature(FEATURE_X86_64_V4);
}
}
}

#undef hasFeature
#undef setFeature
}

Expand All @@ -813,7 +860,7 @@ struct __processor_model {
#ifndef _WIN32
__attribute__((visibility("hidden")))
#endif
unsigned int __cpu_features2 = 0;
unsigned __cpu_features2[(CPU_FEATURE_MAX - 1) / 32];

// A constructor function that sets __cpu_model and __cpu_features2 with
// the right values. This needs to run only once. This constructor is
Expand All @@ -827,6 +874,8 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) {
unsigned Vendor;
unsigned Model, Family;
unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0};
static_assert(sizeof(Features) / sizeof(Features[0]) == 4, "");
static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, "");

// This function needs to run just once.
if (__cpu_model.__cpu_vendor)
Expand All @@ -844,9 +893,10 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) {
// Find available features.
getAvailableFeatures(ECX, EDX, MaxLeaf, &Features[0]);

assert((sizeof(Features)/sizeof(Features[0])) == 2);
__cpu_model.__cpu_features[0] = Features[0];
__cpu_features2 = Features[1];
__cpu_features2[0] = Features[1];
__cpu_features2[1] = Features[2];
__cpu_features2[2] = Features[3];

if (Vendor == SIG_INTEL) {
// Get CPU type.
Expand Down
10 changes: 6 additions & 4 deletions llvm/lib/TargetParser/X86TargetParser.cpp
Expand Up @@ -237,6 +237,7 @@ static constexpr FeatureBitset FeaturesZNVER4 =
// listed here before, which means it doesn't support -march, -mtune and so on.
// FIXME: Remove OnlyForCPUDispatchSpecific after all CPUs here support both
// cpu_dispatch/specific() feature and -march, -mtune, and so on.
// clang-format off
constexpr ProcInfo Processors[] = {
// Empty processor. Include X87 and CMPXCHG8 for backwards compatibility.
{ {""}, CK_None, ~0U, FeatureX87 | FeatureCMPXCHG8B, '\0', false },
Expand Down Expand Up @@ -404,13 +405,14 @@ constexpr ProcInfo Processors[] = {
{ {"znver3"}, CK_ZNVER3, FEATURE_AVX2, FeaturesZNVER3, '\0', false },
{ {"znver4"}, CK_ZNVER4, FEATURE_AVX512VBMI2, FeaturesZNVER4, '\0', false },
// Generic 64-bit processor.
{ {"x86-64"}, CK_x86_64, ~0U, FeaturesX86_64, '\0', false },
{ {"x86-64-v2"}, CK_x86_64_v2, ~0U, FeaturesX86_64_V2, '\0', false },
{ {"x86-64-v3"}, CK_x86_64_v3, ~0U, FeaturesX86_64_V3, '\0', false },
{ {"x86-64-v4"}, CK_x86_64_v4, ~0U, FeaturesX86_64_V4, '\0', false },
{ {"x86-64"}, CK_x86_64, FEATURE_SSE2 , FeaturesX86_64, '\0', false },
{ {"x86-64-v2"}, CK_x86_64_v2, FEATURE_SSE4_2 , FeaturesX86_64_V2, '\0', false },
{ {"x86-64-v3"}, CK_x86_64_v3, FEATURE_AVX2, FeaturesX86_64_V3, '\0', false },
{ {"x86-64-v4"}, CK_x86_64_v4, FEATURE_AVX512VL, FeaturesX86_64_V4, '\0', false },
// Geode processors.
{ {"geode"}, CK_Geode, ~0U, FeaturesGeode, '\0', false },
};
// clang-format on

constexpr const char *NoTuneList[] = {"x86-64-v2", "x86-64-v3", "x86-64-v4"};

Expand Down

2 comments on commit 7a41af8

@chfast
Copy link
Contributor

@chfast chfast commented on 7a41af8 Aug 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this also fix #59961 ?

@MaskRay
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this also fix #59961 ?

No. __builtin_cpu_supports("x86-64-v2") is not supported. I'll work on it.

Please sign in to comment.