-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[AArch64] Turn MaxInterleaveFactor into a subtarget feature #171088
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64 Author: David Green (davemgreen) Changes: The default value for MaxInterleaveFactor is 2, but some CPUs prefer a wider factor of 4. This adds a subtarget feature so that CPUs can override the default in their tuning features, keeping more of the options together in one place. Full diff: https://github.com/llvm/llvm-project/pull/171088.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 066724bea92c9..b6275329c8672 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -915,6 +915,10 @@ def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move",
"UseWzrToVecMove", "true",
"Move from WZR to insert 0 into vector registers">;
+def FeatureMaxInterleaveFactor4 : SubtargetFeature<
+ "max-interleave-factor-4", "MaxInterleaveFactor", "4",
+ "Set the MaxInterleaveFactor to 4 (from the default 2)">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 120415f91c9ae..31990c6d4e222 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -74,7 +74,8 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeatureAddrLSLSlow14,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
"Cortex-A65 ARM processors", [
@@ -306,7 +307,8 @@ def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
FeatureStorePairSuppress,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneMONAKA : SubtargetFeature<"fujitsu-monaka", "ARMProcFamily", "MONAKA",
"Fujitsu FUJITSU-MONAKA processors", [
@@ -328,7 +330,8 @@ def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus",
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureUseFixedOverScalableIfEqualCost]>;
+ FeatureUseFixedOverScalableIfEqualCost,
+ FeatureMaxInterleaveFactor4]>;
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targeting apple OSes.
@@ -431,7 +434,8 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
"Apple A15", [
@@ -452,7 +456,8 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
"Apple A16", [
@@ -473,7 +478,8 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
"Apple A17", [
@@ -494,7 +500,8 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
"Apple M4", [
@@ -514,7 +521,8 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
@@ -528,7 +536,8 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeatureStorePairSuppress,
FeatureALULSLFast,
FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
// Re-uses some scheduling and tunings from the ExynosM3 proc family.
def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
@@ -556,7 +565,8 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureALULSLFast,
- FeatureStorePairSuppress]>;
+ FeatureStorePairSuppress,
+ FeatureMaxInterleaveFactor4]>;
def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
"Qualcomm Falkor processors", [
@@ -566,7 +576,8 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
FeatureALULSLFast,
- FeatureSlowSTRQro]>;
+ FeatureSlowSTRQro,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", "NeoverseE1",
"Neoverse E1 ARM processors", [
@@ -614,7 +625,8 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Ne
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1",
"Neoverse V1 ARM processors", [
@@ -643,7 +655,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
FeatureUseFixedOverScalableIfEqualCost,
FeatureAvoidLDAPUR,
FeaturePredictableSelectIsExpensive,
- FeatureDisableLatencySchedHeuristic]>;
+ FeatureDisableLatencySchedHeuristic,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
"Neoverse V3 ARM processors", [
@@ -655,7 +668,8 @@ def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeatureAvoidLDAPUR,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "NeoverseV3",
"Neoverse V3AE ARM processors", [
@@ -676,7 +690,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
- FeatureALULSLFast]>;
+ FeatureALULSLFast,
+ FeatureMaxInterleaveFactor4]>;
def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
"Cavium ThunderX2 processors", [
@@ -684,7 +699,8 @@ def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "Thund
FeatureArithmeticBccFusion,
FeaturePostRAScheduler,
FeatureStorePairSuppress,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
"ThunderX3T110",
@@ -695,7 +711,8 @@ def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
FeaturePredictableSelectIsExpensive,
FeatureBalanceFPOps,
FeatureStorePairSuppress,
- FeatureStrictAlign]>;
+ FeatureStrictAlign,
+ FeatureMaxInterleaveFactor4]>;
def TuneThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
"Cavium ThunderX processors", [
@@ -743,7 +760,8 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureLdpAlignedOnly,
- FeatureStpAlignedOnly]>;
+ FeatureStpAlignedOnly,
+ FeatureMaxInterleaveFactor4]>;
def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
"Ampere Computing Ampere-1A processors", [
@@ -759,7 +777,8 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
FeatureFuseAddSub2RegAndConstOne,
FeatureStorePairSuppress,
FeatureLdpAlignedOnly,
- FeatureStpAlignedOnly]>;
+ FeatureStpAlignedOnly,
+ FeatureMaxInterleaveFactor4]>;
def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
"Ampere Computing Ampere-1B processors", [
@@ -776,7 +795,8 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive,
FeatureLdpAlignedOnly,
- FeatureStpAlignedOnly]>;
+ FeatureStpAlignedOnly,
+ FeatureMaxInterleaveFactor4]>;
def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily",
"Oryon",
@@ -799,7 +819,8 @@ def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily",
FeaturePerfMon,
FeatureSPE,
FeaturePostRAScheduler,
- HasV8_6aOps]>;
+ HasV8_6aOps,
+ FeatureMaxInterleaveFactor4]>;
def ProcessorFeatures {
list<SubtargetFeature> A320 = [HasV9_2aOps, FeatureNEON, FeatureMTE,
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 53b00e83a36b3..df5dab31df9f9 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -149,7 +149,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxBytesForLoopAlignment = 8;
break;
case CortexA57:
- MaxInterleaveFactor = 4;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(16);
MaxBytesForLoopAlignment = 8;
@@ -199,7 +198,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 256;
PrefFunctionAlignment = Align(8);
PrefLoopAlignment = Align(4);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
@@ -222,26 +220,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
PrefetchDistance = 280;
MinPrefetchStride = 2048;
MaxPrefetchIterationsAhead = 3;
- switch (ARMProcFamily) {
- case AppleA14:
- case AppleA15:
- case AppleA16:
- case AppleA17:
- case AppleM4:
- MaxInterleaveFactor = 4;
- break;
- default:
- break;
- }
break;
case ExynosM3:
- MaxInterleaveFactor = 4;
MaxJumpTableSize = 20;
PrefFunctionAlignment = Align(32);
PrefLoopAlignment = Align(16);
break;
case Falkor:
- MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
CacheLineSize = 128;
@@ -250,7 +235,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxPrefetchIterationsAhead = 8;
break;
case Kryo:
- MaxInterleaveFactor = 4;
VectorInsertExtractBaseCost = 2;
CacheLineSize = 128;
PrefetchDistance = 740;
@@ -271,7 +255,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
case NeoverseV3:
CacheLineSize = 64;
EpilogueVectorizationMinVF = 8;
- MaxInterleaveFactor = 4;
ScatterOverhead = 13;
[[fallthrough]];
case NeoverseN2:
@@ -291,10 +274,8 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
case Neoverse512TVB:
PrefFunctionAlignment = Align(16);
VScaleForTuning = 1;
- MaxInterleaveFactor = 4;
break;
case Saphira:
- MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
break;
@@ -302,7 +283,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 64;
PrefFunctionAlignment = Align(8);
PrefLoopAlignment = Align(4);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
@@ -328,7 +308,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 64;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(4);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
@@ -341,18 +320,15 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 64;
PrefFunctionAlignment = Align(64);
PrefLoopAlignment = Align(64);
- MaxInterleaveFactor = 4;
break;
case Oryon:
CacheLineSize = 64;
PrefFunctionAlignment = Align(16);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
break;
case Olympus:
EpilogueVectorizationMinVF = 8;
- MaxInterleaveFactor = 4;
ScatterOverhead = 13;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(32);
|
fhahn
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks.
This presumably also allows specifying the target feature to use max interleave factor 4 for CPUs that do not have it (or disabling it via the target feature for CPUs that do). It might be good to add a test that just uses the target feature?
The default value for MaxInterleaveFactor is 2, but some CPUs prefer a wider factor of 4. This adds a subtarget feature so that CPUs can override the default in their tuning features, keeping more of the options together in one place.
e67e073 to
89c0380
Compare
The default value for MaxInterleaveFactor is 2, but some CPUs prefer a wider factor of 4. This adds a subtarget feature so that CPUs can override the default in their tuning features, keeping more of the options together in one place.