Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[X86] Fast AVX-512-VNNI vpdpwssd tuning #85375

Merged
ganeshgit merged 1 commit into llvm:main from the vpdpwssd branch
Mar 15, 2024
Merged

Conversation

ganeshgit
Copy link
Contributor

Add a tuning feature, `fast-dpwssd`, to resolve #84182. When the tuning is enabled, the vpdpwssd instruction is retained instead of being expanded into a vpmaddwd + vpaddd sequence.

This pull request was recreated because the previous fixup was broken.

Change-Id: I2eeac5c6c4aadd15a51abdd457c36aa641cf6e8c
@llvmbot
Copy link
Collaborator

llvmbot commented Mar 15, 2024

@llvm/pr-subscribers-backend-x86

Author: Ganesh (ganeshgit)

Changes

Add a tuning feature, `fast-dpwssd`, to resolve #84182. When the tuning is enabled, the vpdpwssd instruction is retained instead of being expanded into a vpmaddwd + vpaddd sequence.

This pull request was recreated because the previous fixup was broken.


Full diff: https://github.com/llvm/llvm-project/pull/85375.diff

5 Files Affected:

  • (modified) llvm/lib/Target/X86/X86.td (+11-1)
  • (modified) llvm/lib/Target/X86/X86InstrInfo.cpp (+11-7)
  • (modified) llvm/lib/Target/X86/X86InstrPredicates.td (+1)
  • (modified) llvm/lib/Target/X86/X86TargetTransformInfo.h (+1)
  • (added) llvm/test/CodeGen/X86/vpdpwssd.ll (+12)
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8367f938c0ddfa..78bc043911f2fc 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -683,6 +683,12 @@ def TuningFastGather
     : SubtargetFeature<"fast-gather", "HasFastGather", "true",
                        "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">;
 
+// Generate vpdpwssd instead of vpmaddwd+vpaddd sequence.
+def TuningFastDPWSSD
+    : SubtargetFeature<
+          "fast-dpwssd", "HasFastDPWSSD", "true",
+          "Prefer vpdpwssd instruction over vpmaddwd+vpaddd instruction sequence">;
+
 def TuningPreferNoGather
     : SubtargetFeature<"prefer-no-gather", "PreferGather", "false",
                        "Prefer no gather instructions">;
@@ -1502,7 +1508,11 @@ def ProcessorFeatures {
     !listconcat(ZN2Tuning, ZN3AdditionalTuning);
   list<SubtargetFeature> ZN3Features =
     !listconcat(ZN2Features, ZN3AdditionalFeatures);
-  list<SubtargetFeature> ZN4Tuning = ZN3Tuning;
+
+
+  list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastDPWSSD];
+  list<SubtargetFeature> ZN4Tuning =
+    !listconcat(ZN3Tuning, ZN4AdditionalTuning);
   list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
                                                   FeatureEVEX512,
                                                   FeatureCDI,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b65f49527ae5dd..43ae6fd590745c 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10565,15 +10565,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
     bool DoRegPressureReduce) const {
   unsigned Opc = Root.getOpcode();
   switch (Opc) {
-  default:
-    return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
-                                                       DoRegPressureReduce);
   case X86::VPDPWSSDrr:
   case X86::VPDPWSSDrm:
   case X86::VPDPWSSDYrr:
   case X86::VPDPWSSDYrm: {
-    Patterns.push_back(MachineCombinerPattern::DPWSSD);
-    return true;
+    if (!Subtarget.hasFastDPWSSD()) {
+      Patterns.push_back(MachineCombinerPattern::DPWSSD);
+      return true;
+    }
+    break;
   }
   case X86::VPDPWSSDZ128r:
   case X86::VPDPWSSDZ128m:
@@ -10581,11 +10581,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
   case X86::VPDPWSSDZ256m:
   case X86::VPDPWSSDZr:
   case X86::VPDPWSSDZm: {
-    if (Subtarget.hasBWI())
+   if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
       Patterns.push_back(MachineCombinerPattern::DPWSSD);
-    return true;
+      return true;
+    }
+    break;
   }
   }
+  return TargetInstrInfo::getMachineCombinerPatterns(Root,
+                                                     Patterns, DoRegPressureReduce);
 }
 
 static void
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 7dd51ba6c027ae..b8e7768bdaf3c4 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -238,5 +238,6 @@ def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
 def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
 def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
 def HasMFence    : Predicate<"Subtarget->hasMFence()">;
+def HasFastDPWSSD: Predicate<"Subtarget->hasFastDPWSSD()">;
 def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
 def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 1a5e6bc886aa67..23035f655098a7 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -94,6 +94,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       X86::TuningNoDomainDelayBlend,
       X86::TuningPreferShiftShuffle,
       X86::TuningFastImmVectorShift,
+      X86::TuningFastDPWSSD,
 
       // Perf-tuning flags.
       X86::TuningFastGather,
diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll
new file mode 100644
index 00000000000000..e6a07b4aeb2719
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vpdpwssd.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s
+
+define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
+; CHECK-LABEL: vpdpwssd_test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
+  ret <16 x i32> %4
+}

Copy link

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff 141145232f915b44aef6e3854f091da03c41a2b6 df56289a07dd121257df6b902dec16a0cb823ed8 -- llvm/lib/Target/X86/X86InstrInfo.cpp llvm/lib/Target/X86/X86TargetTransformInfo.h
View the diff from clang-format here.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 43ae6fd590..bf3907a853 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10581,15 +10581,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
   case X86::VPDPWSSDZ256m:
   case X86::VPDPWSSDZr:
   case X86::VPDPWSSDZm: {
-   if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
+    if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
       Patterns.push_back(MachineCombinerPattern::DPWSSD);
       return true;
     }
     break;
   }
   }
-  return TargetInstrInfo::getMachineCombinerPatterns(Root,
-                                                     Patterns, DoRegPressureReduce);
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+                                                     DoRegPressureReduce);
 }
 
 static void
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 23035f6550..bdaf9b2f13 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -42,72 +42,43 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       X86::FeatureX86_64,
 
       // These features don't have any intrinsics or ABI effect.
-      X86::FeatureNOPL,
-      X86::FeatureCX16,
-      X86::FeatureLAHFSAHF64,
+      X86::FeatureNOPL, X86::FeatureCX16, X86::FeatureLAHFSAHF64,
 
       // Some older targets can be setup to fold unaligned loads.
       X86::FeatureSSEUnalignedMem,
 
       // Codegen control options.
-      X86::TuningFast11ByteNOP,
-      X86::TuningFast15ByteNOP,
-      X86::TuningFastBEXTR,
-      X86::TuningFastHorizontalOps,
-      X86::TuningFastLZCNT,
-      X86::TuningFastScalarFSQRT,
-      X86::TuningFastSHLDRotate,
-      X86::TuningFastScalarShiftMasks,
-      X86::TuningFastVectorShiftMasks,
+      X86::TuningFast11ByteNOP, X86::TuningFast15ByteNOP, X86::TuningFastBEXTR,
+      X86::TuningFastHorizontalOps, X86::TuningFastLZCNT,
+      X86::TuningFastScalarFSQRT, X86::TuningFastSHLDRotate,
+      X86::TuningFastScalarShiftMasks, X86::TuningFastVectorShiftMasks,
       X86::TuningFastVariableCrossLaneShuffle,
-      X86::TuningFastVariablePerLaneShuffle,
-      X86::TuningFastVectorFSQRT,
-      X86::TuningLEAForSP,
-      X86::TuningLEAUsesAG,
-      X86::TuningLZCNTFalseDeps,
-      X86::TuningBranchFusion,
-      X86::TuningMacroFusion,
-      X86::TuningPadShortFunctions,
-      X86::TuningPOPCNTFalseDeps,
-      X86::TuningMULCFalseDeps,
-      X86::TuningPERMFalseDeps,
-      X86::TuningRANGEFalseDeps,
-      X86::TuningGETMANTFalseDeps,
-      X86::TuningMULLQFalseDeps,
-      X86::TuningSlow3OpsLEA,
-      X86::TuningSlowDivide32,
-      X86::TuningSlowDivide64,
-      X86::TuningSlowIncDec,
-      X86::TuningSlowLEA,
-      X86::TuningSlowPMADDWD,
-      X86::TuningSlowPMULLD,
-      X86::TuningSlowSHLD,
-      X86::TuningSlowTwoMemOps,
-      X86::TuningSlowUAMem16,
-      X86::TuningPreferMaskRegisters,
-      X86::TuningInsertVZEROUPPER,
-      X86::TuningUseSLMArithCosts,
-      X86::TuningUseGLMDivSqrtCosts,
-      X86::TuningNoDomainDelay,
-      X86::TuningNoDomainDelayMov,
-      X86::TuningNoDomainDelayShuffle,
-      X86::TuningNoDomainDelayBlend,
-      X86::TuningPreferShiftShuffle,
-      X86::TuningFastImmVectorShift,
+      X86::TuningFastVariablePerLaneShuffle, X86::TuningFastVectorFSQRT,
+      X86::TuningLEAForSP, X86::TuningLEAUsesAG, X86::TuningLZCNTFalseDeps,
+      X86::TuningBranchFusion, X86::TuningMacroFusion,
+      X86::TuningPadShortFunctions, X86::TuningPOPCNTFalseDeps,
+      X86::TuningMULCFalseDeps, X86::TuningPERMFalseDeps,
+      X86::TuningRANGEFalseDeps, X86::TuningGETMANTFalseDeps,
+      X86::TuningMULLQFalseDeps, X86::TuningSlow3OpsLEA,
+      X86::TuningSlowDivide32, X86::TuningSlowDivide64, X86::TuningSlowIncDec,
+      X86::TuningSlowLEA, X86::TuningSlowPMADDWD, X86::TuningSlowPMULLD,
+      X86::TuningSlowSHLD, X86::TuningSlowTwoMemOps, X86::TuningSlowUAMem16,
+      X86::TuningPreferMaskRegisters, X86::TuningInsertVZEROUPPER,
+      X86::TuningUseSLMArithCosts, X86::TuningUseGLMDivSqrtCosts,
+      X86::TuningNoDomainDelay, X86::TuningNoDomainDelayMov,
+      X86::TuningNoDomainDelayShuffle, X86::TuningNoDomainDelayBlend,
+      X86::TuningPreferShiftShuffle, X86::TuningFastImmVectorShift,
       X86::TuningFastDPWSSD,
 
       // Perf-tuning flags.
-      X86::TuningFastGather,
-      X86::TuningSlowUAMem32,
+      X86::TuningFastGather, X86::TuningSlowUAMem32,
       X86::TuningAllowLight256Bit,
 
       // Based on whether user set the -mprefer-vector-width command line.
-      X86::TuningPrefer128Bit,
-      X86::TuningPrefer256Bit,
+      X86::TuningPrefer128Bit, X86::TuningPrefer256Bit,
 
       // CPU name enums. These just follow CPU string.
-      X86::ProcIntelAtom
-  };
+      X86::ProcIntelAtom};
 
 public:
   explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F)

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM - cheers

@ganeshgit ganeshgit merged commit 61fadd0 into llvm:main Mar 15, 2024
4 of 6 checks passed
@ganeshgit ganeshgit deleted the vpdpwssd branch March 15, 2024 15:31
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

vpdpwssd instruction not generated despite giving better performance than vpmaddwd+vpaddd expansion
3 participants