Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Features.td
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,10 @@ def FeatureDisableUnpredicatedLdStLower : SubtargetFeature<
"disable-unpredicated-ld-st-lower", "DisableUnpredicatedLdStLower",
"true", "Disable lowering unpredicated loads/stores as LDR/STR">;

// Tuning feature that sets the subtarget's MaxInterleaveFactor field to 4.
// Per the description string below, the value is 2 when this feature is
// absent. It is meant to be listed in a CPU's Tune* feature list so that the
// interleave factor is expressed declaratively instead of being assigned
// per-CPU in AArch64Subtarget::initializeProperties().
def FeatureMaxInterleaveFactor4 : SubtargetFeature<
"max-interleave-factor-4", "MaxInterleaveFactor", "4",
"Set the MaxInterleaveFactor to 4 (from the default 2)">;

//===----------------------------------------------------------------------===//
// Architectures.
//
Expand Down
72 changes: 48 additions & 24 deletions llvm/lib/Target/AArch64/AArch64Processors.td
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeatureAddrLSLSlow14,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
FeaturePredictableSelectIsExpensive,
FeatureMaxInterleaveFactor4]>;

def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
"Cortex-A65 ARM processors", [
Expand Down Expand Up @@ -307,7 +308,8 @@ def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
FeatureArithmeticBccFusion,
FeatureStorePairSuppress,
FeaturePredictableSelectIsExpensive,
FeatureDisableUnpredicatedLdStLower]>;
FeatureDisableUnpredicatedLdStLower,
FeatureMaxInterleaveFactor4]>;

def TuneMONAKA : SubtargetFeature<"fujitsu-monaka", "ARMProcFamily", "MONAKA",
"Fujitsu FUJITSU-MONAKA processors", [
Expand All @@ -329,7 +331,8 @@ def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus",
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureUseFixedOverScalableIfEqualCost]>;
FeatureUseFixedOverScalableIfEqualCost,
FeatureMaxInterleaveFactor4]>;

// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targeting apple OSes.
Expand Down Expand Up @@ -432,7 +435,8 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;
FeatureZCZeroingFPR128,
FeatureMaxInterleaveFactor4]>;

def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
"Apple A15", [
Expand All @@ -453,7 +457,8 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;
FeatureZCZeroingFPR128,
FeatureMaxInterleaveFactor4]>;

def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
"Apple A16", [
Expand All @@ -474,7 +479,8 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;
FeatureZCZeroingFPR128,
FeatureMaxInterleaveFactor4]>;

def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
"Apple A17", [
Expand All @@ -495,7 +501,8 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;
FeatureZCZeroingFPR128,
FeatureMaxInterleaveFactor4]>;

def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
"Apple M4", [
Expand All @@ -515,7 +522,8 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;
FeatureZCZeroingFPR128,
FeatureMaxInterleaveFactor4]>;

def TuneAppleM5 : SubtargetFeature<"apple-m5", "ARMProcFamily", "AppleM5",
"Apple M5", [
Expand All @@ -535,7 +543,8 @@ def TuneAppleM5 : SubtargetFeature<"apple-m5", "ARMProcFamily", "AppleM5",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;
FeatureZCZeroingFPR128,
FeatureMaxInterleaveFactor4]>;

def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
Expand All @@ -549,7 +558,8 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeatureStorePairSuppress,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive]>;
FeaturePredictableSelectIsExpensive,
FeatureMaxInterleaveFactor4]>;

// Re-uses some scheduling and tunings from the ExynosM3 proc family.
def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
Expand All @@ -568,7 +578,8 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;
FeatureZCZeroingGPR64,
FeatureMaxInterleaveFactor4]>;

def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
"Qualcomm Kryo processors", [
Expand All @@ -577,7 +588,8 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureALULSLFast,
FeatureStorePairSuppress]>;
FeatureStorePairSuppress,
FeatureMaxInterleaveFactor4]>;

def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
"Qualcomm Falkor processors", [
Expand All @@ -587,7 +599,8 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
FeatureALULSLFast,
FeatureSlowSTRQro]>;
FeatureSlowSTRQro,
FeatureMaxInterleaveFactor4]>;

def TuneNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", "NeoverseE1",
"Neoverse E1 ARM processors", [
Expand Down Expand Up @@ -635,7 +648,8 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Ne
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
FeaturePredictableSelectIsExpensive,
FeatureMaxInterleaveFactor4]>;

def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1",
"Neoverse V1 ARM processors", [
Expand Down Expand Up @@ -664,7 +678,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
FeatureUseFixedOverScalableIfEqualCost,
FeatureAvoidLDAPUR,
FeaturePredictableSelectIsExpensive,
FeatureDisableLatencySchedHeuristic]>;
FeatureDisableLatencySchedHeuristic,
FeatureMaxInterleaveFactor4]>;

def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
"Neoverse V3 ARM processors", [
Expand All @@ -676,7 +691,8 @@ def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeatureAvoidLDAPUR,
FeaturePredictableSelectIsExpensive]>;
FeaturePredictableSelectIsExpensive,
FeatureMaxInterleaveFactor4]>;

def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "NeoverseV3",
"Neoverse V3AE ARM processors", [
Expand All @@ -688,7 +704,8 @@ def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "Neover
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeatureAvoidLDAPUR,
FeaturePredictableSelectIsExpensive]>;
FeaturePredictableSelectIsExpensive,
FeatureMaxInterleaveFactor4]>;

def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
"Qualcomm Saphira processors", [
Expand All @@ -697,15 +714,17 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
FeatureALULSLFast]>;
FeatureALULSLFast,
FeatureMaxInterleaveFactor4]>;

def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
"Cavium ThunderX2 processors", [
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
FeaturePostRAScheduler,
FeatureStorePairSuppress,
FeaturePredictableSelectIsExpensive]>;
FeaturePredictableSelectIsExpensive,
FeatureMaxInterleaveFactor4]>;

def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
"ThunderX3T110",
Expand All @@ -716,7 +735,8 @@ def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
FeaturePredictableSelectIsExpensive,
FeatureBalanceFPOps,
FeatureStorePairSuppress,
FeatureStrictAlign]>;
FeatureStrictAlign,
FeatureMaxInterleaveFactor4]>;

def TuneThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
"Cavium ThunderX processors", [
Expand Down Expand Up @@ -764,7 +784,8 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureLdpAlignedOnly,
FeatureStpAlignedOnly]>;
FeatureStpAlignedOnly,
FeatureMaxInterleaveFactor4]>;

def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
"Ampere Computing Ampere-1A processors", [
Expand All @@ -780,7 +801,8 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
FeatureFuseAddSub2RegAndConstOne,
FeatureStorePairSuppress,
FeatureLdpAlignedOnly,
FeatureStpAlignedOnly]>;
FeatureStpAlignedOnly,
FeatureMaxInterleaveFactor4]>;

def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
"Ampere Computing Ampere-1B processors", [
Expand All @@ -797,7 +819,8 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive,
FeatureLdpAlignedOnly,
FeatureStpAlignedOnly]>;
FeatureStpAlignedOnly,
FeatureMaxInterleaveFactor4]>;

def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily",
"Oryon",
Expand All @@ -820,7 +843,8 @@ def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily",
FeaturePerfMon,
FeatureSPE,
FeaturePostRAScheduler,
HasV8_6aOps]>;
HasV8_6aOps,
FeatureMaxInterleaveFactor4]>;

def ProcessorFeatures {
list<SubtargetFeature> A320 = [HasV9_2aOps, FeatureNEON, FeatureMTE,
Expand Down
15 changes: 0 additions & 15 deletions llvm/lib/Target/AArch64/AArch64Subtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxBytesForLoopAlignment = 8;
break;
case CortexA57:
MaxInterleaveFactor = 4;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(16);
MaxBytesForLoopAlignment = 8;
Expand Down Expand Up @@ -199,7 +198,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 256;
PrefFunctionAlignment = Align(8);
PrefLoopAlignment = Align(4);
MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
Expand All @@ -223,17 +221,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
PrefetchDistance = 280;
MinPrefetchStride = 2048;
MaxPrefetchIterationsAhead = 3;
if (isAppleMLike())
MaxInterleaveFactor = 4;
break;
case ExynosM3:
MaxInterleaveFactor = 4;
MaxJumpTableSize = 20;
PrefFunctionAlignment = Align(32);
PrefLoopAlignment = Align(16);
break;
case Falkor:
MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
CacheLineSize = 128;
Expand All @@ -242,7 +236,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxPrefetchIterationsAhead = 8;
break;
case Kryo:
MaxInterleaveFactor = 4;
VectorInsertExtractBaseCost = 2;
CacheLineSize = 128;
PrefetchDistance = 740;
Expand All @@ -263,7 +256,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
case NeoverseV3:
CacheLineSize = 64;
EpilogueVectorizationMinVF = 8;
MaxInterleaveFactor = 4;
ScatterOverhead = 13;
[[fallthrough]];
case NeoverseN2:
Expand All @@ -283,18 +275,15 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
case Neoverse512TVB:
PrefFunctionAlignment = Align(16);
VScaleForTuning = 1;
MaxInterleaveFactor = 4;
break;
case Saphira:
MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
break;
case ThunderX2T99:
CacheLineSize = 64;
PrefFunctionAlignment = Align(8);
PrefLoopAlignment = Align(4);
MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
Expand All @@ -320,7 +309,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 64;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(4);
MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
Expand All @@ -333,18 +321,15 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 64;
PrefFunctionAlignment = Align(64);
PrefLoopAlignment = Align(64);
MaxInterleaveFactor = 4;
break;
case Oryon:
CacheLineSize = 64;
PrefFunctionAlignment = Align(16);
MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
break;
case Olympus:
EpilogueVectorizationMinVF = 8;
MaxInterleaveFactor = 4;
ScatterOverhead = 13;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(32);
Expand Down
Loading