-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] Fast AVX-512-VNNI vpdpwssd tuning #85375
Conversation
Change-Id: I2eeac5c6c4aadd15a51abdd457c36aa641cf6e8c
@llvm/pr-subscribers-backend-x86 Author: Ganesh (ganeshgit) Changes: Adding a tuning feature to resolve #84182. Generation of vpdpwssd (instead of vpmaddwd + vpaddd sequence) will be retained when the tuning is enabled. Recreating after the previous fixup got broken. Full diff: https://github.com/llvm/llvm-project/pull/85375.diff 5 Files Affected:
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8367f938c0ddfa..78bc043911f2fc 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -683,6 +683,12 @@ def TuningFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
"Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">;
+// Generate vpdpwssd instead of vpmaddwd+vpaddd sequence.
+def TuningFastDPWSSD
+ : SubtargetFeature<
+ "fast-dpwssd", "HasFastDPWSSD", "true",
+ "Prefer vpdpwssd instruction over vpmaddwd+vpaddd instruction sequence">;
+
def TuningPreferNoGather
: SubtargetFeature<"prefer-no-gather", "PreferGather", "false",
"Prefer no gather instructions">;
@@ -1502,7 +1508,11 @@ def ProcessorFeatures {
!listconcat(ZN2Tuning, ZN3AdditionalTuning);
list<SubtargetFeature> ZN3Features =
!listconcat(ZN2Features, ZN3AdditionalFeatures);
- list<SubtargetFeature> ZN4Tuning = ZN3Tuning;
+
+
+ list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastDPWSSD];
+ list<SubtargetFeature> ZN4Tuning =
+ !listconcat(ZN3Tuning, ZN4AdditionalTuning);
list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
FeatureEVEX512,
FeatureCDI,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b65f49527ae5dd..43ae6fd590745c 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10565,15 +10565,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
bool DoRegPressureReduce) const {
unsigned Opc = Root.getOpcode();
switch (Opc) {
- default:
- return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
- DoRegPressureReduce);
case X86::VPDPWSSDrr:
case X86::VPDPWSSDrm:
case X86::VPDPWSSDYrr:
case X86::VPDPWSSDYrm: {
- Patterns.push_back(MachineCombinerPattern::DPWSSD);
- return true;
+ if (!Subtarget.hasFastDPWSSD()) {
+ Patterns.push_back(MachineCombinerPattern::DPWSSD);
+ return true;
+ }
+ break;
}
case X86::VPDPWSSDZ128r:
case X86::VPDPWSSDZ128m:
@@ -10581,11 +10581,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
case X86::VPDPWSSDZ256m:
case X86::VPDPWSSDZr:
case X86::VPDPWSSDZm: {
- if (Subtarget.hasBWI())
+ if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
Patterns.push_back(MachineCombinerPattern::DPWSSD);
- return true;
+ return true;
+ }
+ break;
}
}
+ return TargetInstrInfo::getMachineCombinerPatterns(Root,
+ Patterns, DoRegPressureReduce);
}
static void
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 7dd51ba6c027ae..b8e7768bdaf3c4 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -238,5 +238,6 @@ def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
+def HasFastDPWSSD: Predicate<"Subtarget->hasFastDPWSSD()">;
def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 1a5e6bc886aa67..23035f655098a7 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -94,6 +94,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::TuningNoDomainDelayBlend,
X86::TuningPreferShiftShuffle,
X86::TuningFastImmVectorShift,
+ X86::TuningFastDPWSSD,
// Perf-tuning flags.
X86::TuningFastGather,
diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll
new file mode 100644
index 00000000000000..e6a07b4aeb2719
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vpdpwssd.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s
+
+define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
+; CHECK-LABEL: vpdpwssd_test:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
+ ret <16 x i32> %4
+}
|
You can test this locally with the following command: git-clang-format --diff 141145232f915b44aef6e3854f091da03c41a2b6 df56289a07dd121257df6b902dec16a0cb823ed8 -- llvm/lib/Target/X86/X86InstrInfo.cpp llvm/lib/Target/X86/X86TargetTransformInfo.h
View the diff from clang-format here.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 43ae6fd590..bf3907a853 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10581,15 +10581,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
case X86::VPDPWSSDZ256m:
case X86::VPDPWSSDZr:
case X86::VPDPWSSDZm: {
- if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
+ if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
Patterns.push_back(MachineCombinerPattern::DPWSSD);
return true;
}
break;
}
}
- return TargetInstrInfo::getMachineCombinerPatterns(Root,
- Patterns, DoRegPressureReduce);
+ return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+ DoRegPressureReduce);
}
static void
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 23035f6550..bdaf9b2f13 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -42,72 +42,43 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureX86_64,
// These features don't have any intrinsics or ABI effect.
- X86::FeatureNOPL,
- X86::FeatureCX16,
- X86::FeatureLAHFSAHF64,
+ X86::FeatureNOPL, X86::FeatureCX16, X86::FeatureLAHFSAHF64,
// Some older targets can be setup to fold unaligned loads.
X86::FeatureSSEUnalignedMem,
// Codegen control options.
- X86::TuningFast11ByteNOP,
- X86::TuningFast15ByteNOP,
- X86::TuningFastBEXTR,
- X86::TuningFastHorizontalOps,
- X86::TuningFastLZCNT,
- X86::TuningFastScalarFSQRT,
- X86::TuningFastSHLDRotate,
- X86::TuningFastScalarShiftMasks,
- X86::TuningFastVectorShiftMasks,
+ X86::TuningFast11ByteNOP, X86::TuningFast15ByteNOP, X86::TuningFastBEXTR,
+ X86::TuningFastHorizontalOps, X86::TuningFastLZCNT,
+ X86::TuningFastScalarFSQRT, X86::TuningFastSHLDRotate,
+ X86::TuningFastScalarShiftMasks, X86::TuningFastVectorShiftMasks,
X86::TuningFastVariableCrossLaneShuffle,
- X86::TuningFastVariablePerLaneShuffle,
- X86::TuningFastVectorFSQRT,
- X86::TuningLEAForSP,
- X86::TuningLEAUsesAG,
- X86::TuningLZCNTFalseDeps,
- X86::TuningBranchFusion,
- X86::TuningMacroFusion,
- X86::TuningPadShortFunctions,
- X86::TuningPOPCNTFalseDeps,
- X86::TuningMULCFalseDeps,
- X86::TuningPERMFalseDeps,
- X86::TuningRANGEFalseDeps,
- X86::TuningGETMANTFalseDeps,
- X86::TuningMULLQFalseDeps,
- X86::TuningSlow3OpsLEA,
- X86::TuningSlowDivide32,
- X86::TuningSlowDivide64,
- X86::TuningSlowIncDec,
- X86::TuningSlowLEA,
- X86::TuningSlowPMADDWD,
- X86::TuningSlowPMULLD,
- X86::TuningSlowSHLD,
- X86::TuningSlowTwoMemOps,
- X86::TuningSlowUAMem16,
- X86::TuningPreferMaskRegisters,
- X86::TuningInsertVZEROUPPER,
- X86::TuningUseSLMArithCosts,
- X86::TuningUseGLMDivSqrtCosts,
- X86::TuningNoDomainDelay,
- X86::TuningNoDomainDelayMov,
- X86::TuningNoDomainDelayShuffle,
- X86::TuningNoDomainDelayBlend,
- X86::TuningPreferShiftShuffle,
- X86::TuningFastImmVectorShift,
+ X86::TuningFastVariablePerLaneShuffle, X86::TuningFastVectorFSQRT,
+ X86::TuningLEAForSP, X86::TuningLEAUsesAG, X86::TuningLZCNTFalseDeps,
+ X86::TuningBranchFusion, X86::TuningMacroFusion,
+ X86::TuningPadShortFunctions, X86::TuningPOPCNTFalseDeps,
+ X86::TuningMULCFalseDeps, X86::TuningPERMFalseDeps,
+ X86::TuningRANGEFalseDeps, X86::TuningGETMANTFalseDeps,
+ X86::TuningMULLQFalseDeps, X86::TuningSlow3OpsLEA,
+ X86::TuningSlowDivide32, X86::TuningSlowDivide64, X86::TuningSlowIncDec,
+ X86::TuningSlowLEA, X86::TuningSlowPMADDWD, X86::TuningSlowPMULLD,
+ X86::TuningSlowSHLD, X86::TuningSlowTwoMemOps, X86::TuningSlowUAMem16,
+ X86::TuningPreferMaskRegisters, X86::TuningInsertVZEROUPPER,
+ X86::TuningUseSLMArithCosts, X86::TuningUseGLMDivSqrtCosts,
+ X86::TuningNoDomainDelay, X86::TuningNoDomainDelayMov,
+ X86::TuningNoDomainDelayShuffle, X86::TuningNoDomainDelayBlend,
+ X86::TuningPreferShiftShuffle, X86::TuningFastImmVectorShift,
X86::TuningFastDPWSSD,
// Perf-tuning flags.
- X86::TuningFastGather,
- X86::TuningSlowUAMem32,
+ X86::TuningFastGather, X86::TuningSlowUAMem32,
X86::TuningAllowLight256Bit,
// Based on whether user set the -mprefer-vector-width command line.
- X86::TuningPrefer128Bit,
- X86::TuningPrefer256Bit,
+ X86::TuningPrefer128Bit, X86::TuningPrefer256Bit,
// CPU name enums. These just follow CPU string.
- X86::ProcIntelAtom
- };
+ X86::ProcIntelAtom};
public:
explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F)
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
Adding a tuning feature to resolve #84182. Generation of vpdpwssd (instead of vpmaddwd + vpaddd sequence) will be retained when the tuning is enabled.
Recreating after the previous fixup got broken.