Skip to content

Commit

Permalink
Adding tuning flags for int <-> fp domain switching penalties; NFC
Browse files Browse the repository at this point in the history
Atom
    - No domain switching penalties
Nehalem+
    - No penalty on moves
Haswell+
    - No penalty on moves / shuffles
Skylake+
    - No penalty on moves / shuffles / blends

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D143859
  • Loading branch information
goldsteinn committed Feb 28, 2023
1 parent 7198c87 commit cecaf29
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 11 deletions.
65 changes: 54 additions & 11 deletions llvm/lib/Target/X86/X86.td
Expand Up @@ -527,6 +527,29 @@ def TuningFastVariablePerLaneShuffle
"HasFastVariablePerLaneShuffle",
"true", "Per-lane shuffles with variable masks are fast">;

// Atom-family processors (e.g. Goldmont / Tremont) have no bypass delay:
// using an instruction from the 'wrong' (int vs. fp) domain incurs no
// domain-switching penalty. X86Subtarget treats this flag as implying the
// finer-grained mov / blend / shuffle variants defined below.
def TuningNoDomainDelay : SubtargetFeature<"no-bypass-delay",
"NoDomainDelay","true",
"Has no bypass delay when using the 'wrong' domain">;

// Many processors (Nehalem and newer on Intel) have no bypass delay when a
// move instruction of the 'wrong' (int vs. fp) domain type is used.
def TuningNoDomainDelayMov : SubtargetFeature<"no-bypass-delay-mov",
"NoDomainDelayMov","true",
"Has no bypass delay when using the 'wrong' mov type">;

// Newer processors (Skylake and newer on Intel) have no bypass delay when a
// blend instruction of the 'wrong' (int vs. fp) domain type is used.
def TuningNoDomainDelayBlend : SubtargetFeature<"no-bypass-delay-blend",
"NoDomainDelayBlend","true",
"Has no bypass delay when using the 'wrong' blend type">;

// Newer processors (Haswell and newer on Intel) have no bypass delay when a
// shuffle instruction of the 'wrong' (int vs. fp) domain type is used.
def TuningNoDomainDelayShuffle : SubtargetFeature<"no-bypass-delay-shuffle",
"NoDomainDelayShuffle","true",
"Has no bypass delay when using the 'wrong' shuffle type">;

// On some X86 processors, a vzeroupper instruction should be inserted after
// using ymm/zmm registers before executing code that may use SSE instructions.
def TuningInsertVZEROUPPER
Expand Down Expand Up @@ -781,7 +804,8 @@ def ProcessorFeatures {
// Nehalem
list<SubtargetFeature> NHMFeatures = X86_64V2Features;
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
TuningInsertVZEROUPPER];
TuningInsertVZEROUPPER,
TuningNoDomainDelayMov];

// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
Expand All @@ -801,7 +825,8 @@ def ProcessorFeatures {
TuningFastSHLDRotate,
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER];
TuningInsertVZEROUPPER,
TuningNoDomainDelayMov];
list<SubtargetFeature> SNBFeatures =
!listconcat(WSMFeatures, SNBAdditionalFeatures);

Expand Down Expand Up @@ -833,7 +858,9 @@ def ProcessorFeatures {
TuningPOPCNTFalseDeps,
TuningLZCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit];
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle];
list<SubtargetFeature> HSWFeatures =
!listconcat(IVBFeatures, HSWAdditionalFeatures);

Expand Down Expand Up @@ -862,7 +889,10 @@ def ProcessorFeatures {
TuningFastVariablePerLaneShuffle,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit];
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);

Expand Down Expand Up @@ -891,7 +921,10 @@ def ProcessorFeatures {
TuningPrefer256Bit,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit];
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);

Expand Down Expand Up @@ -929,7 +962,10 @@ def ProcessorFeatures {
TuningFastVariablePerLaneShuffle,
TuningPrefer256Bit,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit];
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);

Expand All @@ -954,7 +990,10 @@ def ProcessorFeatures {
TuningFastVariablePerLaneShuffle,
TuningPrefer256Bit,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit];
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);

Expand Down Expand Up @@ -1028,7 +1067,8 @@ def ProcessorFeatures {
TuningSlowTwoMemOps,
TuningLEAUsesAG,
TuningPadShortFunctions,
TuningInsertVZEROUPPER];
TuningInsertVZEROUPPER,
TuningNoDomainDelay];

// Silvermont
list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
Expand All @@ -1046,7 +1086,8 @@ def ProcessorFeatures {
TuningFast7ByteNOP,
TuningFastMOVBE,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER];
TuningInsertVZEROUPPER,
TuningNoDomainDelay];
list<SubtargetFeature> SLMFeatures =
!listconcat(AtomFeatures, SLMAdditionalFeatures);

Expand All @@ -1066,7 +1107,8 @@ def ProcessorFeatures {
TuningSlowIncDec,
TuningFastMOVBE,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER];
TuningInsertVZEROUPPER,
TuningNoDomainDelay];
list<SubtargetFeature> GLMFeatures =
!listconcat(SLMFeatures, GLMAdditionalFeatures);

Expand All @@ -1078,7 +1120,8 @@ def ProcessorFeatures {
TuningSlowLEA,
TuningSlowIncDec,
TuningFastMOVBE,
TuningInsertVZEROUPPER];
TuningInsertVZEROUPPER,
TuningNoDomainDelay];
list<SubtargetFeature> GLPFeatures =
!listconcat(GLMFeatures, GLPAdditionalFeatures);

Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/X86/X86Subtarget.h
Expand Up @@ -249,6 +249,17 @@ class X86Subtarget final : public X86GenSubtargetInfo {
return hasBWI() && canExtendTo512DQ();
}

// Returns the NoDomainDelay tuning flag ("no-bypass-delay"): true when the
// subtarget has no bypass delay for using the 'wrong' domain.
bool hasNoDomainDelay() const { return NoDomainDelay; }
// True when using the 'wrong' mov type carries no bypass delay. The blanket
// no-domain-delay flag implies this; otherwise the mov-specific flag decides.
bool hasNoDomainDelayMov() const {
  if (hasNoDomainDelay())
    return true;
  return NoDomainDelayMov;
}
// True when using the 'wrong' blend type carries no bypass delay. The blanket
// no-domain-delay flag implies this; otherwise the blend-specific flag decides.
bool hasNoDomainDelayBlend() const {
  if (hasNoDomainDelay())
    return true;
  return NoDomainDelayBlend;
}
// True when using the 'wrong' shuffle type carries no bypass delay. The blanket
// no-domain-delay flag implies this; otherwise the shuffle-specific flag
// decides.
bool hasNoDomainDelayShuffle() const {
  if (hasNoDomainDelay())
    return true;
  return NoDomainDelayShuffle;
}

// If there are no 512-bit vectors and we prefer not to use 512-bit registers,
// disable them in the legalizer.
bool useAVX512Regs() const {
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86TargetTransformInfo.h
Expand Up @@ -88,6 +88,10 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::TuningInsertVZEROUPPER,
X86::TuningUseSLMArithCosts,
X86::TuningUseGLMDivSqrtCosts,
X86::TuningNoDomainDelay,
X86::TuningNoDomainDelayMov,
X86::TuningNoDomainDelayShuffle,
X86::TuningNoDomainDelayBlend,

// Perf-tuning flags.
X86::TuningFastGather,
Expand Down

0 comments on commit cecaf29

Please sign in to comment.