[X86] Fast AVX-512-VNNI vpdpwssd tuning #85033

Closed

Conversation

@ganeshgit (Contributor) commented Mar 13, 2024

Adding a tuning feature to resolve #84182. When the tuning is enabled, the vpdpwssd instruction is retained instead of being expanded into the vpmaddwd + vpaddd sequence.
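
For context, a minimal intrinsics sketch (not part of the patch; the function names are illustrative) of the two forms involved. vpdpwssd performs the same widening multiply of adjacent signed 16-bit pairs, horizontal add, and 32-bit accumulate that the vpmaddwd + vpaddd pair does, but as one instruction:

// Hedged sketch, assuming AVX512VNNI and AVX512BW are enabled
// (e.g. -mavx512vnni -mavx512bw).
#include <immintrin.h>

// Fused form: kept as-is when the fast-pwssd tuning is enabled.
__m512i dot_acc_fused(__m512i acc, __m512i a, __m512i b) {
  return _mm512_dpwssd_epi32(acc, a, b); // vpdpwssd
}

// Expanded form that the machine combiner otherwise substitutes.
__m512i dot_acc_expanded(__m512i acc, __m512i a, __m512i b) {
  __m512i madd = _mm512_madd_epi16(a, b); // vpmaddwd
  return _mm512_add_epi32(acc, madd);     // vpaddd
}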

@llvmbot (Collaborator) commented Mar 13, 2024

@llvm/pr-subscribers-backend-x86

Author: Ganesh (ganeshgit)

Changes

Adding a tuning feature to resolve #84182. When the tuning is enabled, the vpdpwssd instruction is retained instead of being expanded into the vpmaddwd + vpaddd sequence.


Full diff: https://github.com/llvm/llvm-project/pull/85033.diff

6 Files Affected:

  • (modified) llvm/lib/Target/X86/X86.td (+11-1)
  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+1-1)
  • (modified) llvm/lib/Target/X86/X86InstrInfo.cpp (+9-4)
  • (modified) llvm/lib/Target/X86/X86InstrPredicates.td (+1)
  • (modified) llvm/lib/Target/X86/X86TargetTransformInfo.h (+1)
  • (added) llvm/test/CodeGen/X86/vpdpwssd.ll (+15)
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8367f938c0ddfa..3d426ec612bb2a 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -683,6 +683,12 @@ def TuningFastGather
     : SubtargetFeature<"fast-gather", "HasFastGather", "true",
                        "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">;
 
+// Generate vpdpwssd instead of vpmaddwd+vpaddd sequence.
+def TuningFastPWSSD
+    : SubtargetFeature<
+          "fast-pwssd", "HasFastPWSSD", "true",
+          "Prefer vpdpwssd instruction over vpmaddwd+vpaddd instruction sequence">;
+
 def TuningPreferNoGather
     : SubtargetFeature<"prefer-no-gather", "PreferGather", "false",
                        "Prefer no gather instructions">;
@@ -1502,7 +1508,11 @@ def ProcessorFeatures {
     !listconcat(ZN2Tuning, ZN3AdditionalTuning);
   list<SubtargetFeature> ZN3Features =
     !listconcat(ZN2Features, ZN3AdditionalFeatures);
-  list<SubtargetFeature> ZN4Tuning = ZN3Tuning;
+
+
+  list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastPWSSD];
+  list<SubtargetFeature> ZN4Tuning =
+    !listconcat(ZN3Tuning, ZN4AdditionalTuning);
   list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
                                                   FeatureEVEX512,
                                                   FeatureCDI,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2b5e3c0379a138..2367591152f7e1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48215,7 +48215,7 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
 
   // We do not split for SSE at all, but we need to split vectors for AVX1 and
   // AVX2.
-  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() && 
+  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
       TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
     SDValue LoX, HiX;
     std::tie(LoX, HiX) = splitVector(X, DAG, DL);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b65f49527ae5dd..57010f1b02e1f1 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10572,8 +10572,11 @@ bool X86InstrInfo::getMachineCombinerPatterns(
   case X86::VPDPWSSDrm:
   case X86::VPDPWSSDYrr:
   case X86::VPDPWSSDYrm: {
-    Patterns.push_back(MachineCombinerPattern::DPWSSD);
-    return true;
+    if (!Subtarget.hasFastPWSSD()) {
+      Patterns.push_back(MachineCombinerPattern::DPWSSD);
+      return true;
+    }
+    return false;
   }
   case X86::VPDPWSSDZ128r:
   case X86::VPDPWSSDZ128m:
@@ -10581,9 +10584,11 @@ bool X86InstrInfo::getMachineCombinerPatterns(
   case X86::VPDPWSSDZ256m:
   case X86::VPDPWSSDZr:
   case X86::VPDPWSSDZm: {
-    if (Subtarget.hasBWI())
+    if (Subtarget.hasBWI() && !Subtarget.hasFastPWSSD()) {
       Patterns.push_back(MachineCombinerPattern::DPWSSD);
-    return true;
+      return true;
+    }
+    return false;
   }
   }
 }
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 7dd51ba6c027ae..9fd846dd05c5d0 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -238,5 +238,6 @@ def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
 def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
 def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
 def HasMFence    : Predicate<"Subtarget->hasMFence()">;
+def HasFastPWSSD: Predicate<"Subtarget->hasFastPWSSD()">;
 def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
 def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 1a5e6bc886aa67..691e318cda3d33 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -94,6 +94,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       X86::TuningNoDomainDelayBlend,
       X86::TuningPreferShiftShuffle,
       X86::TuningFastImmVectorShift,
+      X86::TuningFastPWSSD,
 
       // Perf-tuning flags.
       X86::TuningFastGather,
diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll
new file mode 100644
index 00000000000000..a044378fae255c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vpdpwssd.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
+; CHECK-LABEL: vpdpwssd_test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
+  ret <16 x i32> %4
+}
+
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>) #1
+
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }

github-actions bot commented Mar 13, 2024

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff fe1d02b08ce2ca74de5b18d9f141d503b7985ec5 68e858d9668cead4682e329d636516ee35c06d26 -- clang/test/CodeGen/PowerPC/toc-data-attribute.c clang/test/CodeGen/PowerPC/toc-data-attribute.cpp clang/test/CodeGen/PowerPC/toc-data-diagnostics.c clang/test/CodeGen/PowerPC/toc-data-structs-arrays.cpp clang/test/CodeGen/complex-math-mixed.c clang/test/Driver/toc-conf.c clang/test/Driver/tocdata-cc1.c clang/test/Sema/complex-arithmetic.c clang/test/Sema/const-init.c clang/unittests/StaticAnalyzer/IsCLibraryFunctionTest.cpp flang/runtime/Float128Math/fma.cpp flang/runtime/Float128Math/random.cpp flang/runtime/random-templates.h clang/include/clang/Basic/CodeGenOptions.h clang/include/clang/InstallAPI/FrontendRecords.h clang/include/clang/Sema/Sema.h clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h clang/lib/AST/Decl.cpp clang/lib/AST/Interp/Interp.h clang/lib/CodeGen/CGDecl.cpp clang/lib/CodeGen/CodeGenModule.cpp clang/lib/CodeGen/Targets/PPC.cpp clang/lib/Driver/ToolChains/AIX.cpp clang/lib/Frontend/CompilerInstance.cpp clang/lib/Sema/SemaDeclAttr.cpp clang/lib/Sema/SemaExpr.cpp clang/lib/Sema/SemaTemplate.cpp clang/lib/StaticAnalyzer/Core/CheckerContext.cpp clang/test/CodeGen/const-init.c clang/test/CodeGen/volatile.cpp clang/test/Sema/attr-cleanup.c clang/test/SemaCXX/cxx20-ctad-type-alias.cpp clang/unittests/Interpreter/InterpreterExtensionsTest.cpp compiler-rt/lib/fuzzer/FuzzerUtilLinux.cpp compiler-rt/lib/interception/interception.h compiler-rt/lib/sanitizer_common/sanitizer_asm.h compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp compiler-rt/lib/sanitizer_common/sanitizer_procmaps_common.cpp cross-project-tests/debuginfo-tests/llgdb-tests/forward-declare-class.cpp flang/lib/Frontend/CompilerInvocation.cpp flang/lib/Frontend/FrontendAction.cpp flang/lib/Lower/OpenMP/OpenMP.cpp flang/lib/Lower/OpenMP/ReductionProcessor.cpp flang/lib/Lower/OpenMP/ReductionProcessor.h flang/lib/Optimizer/Builder/IntrinsicCall.cpp flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp flang/lib/Semantics/check-omp-structure.cpp flang/runtime/Float128Math/math-entries.h flang/runtime/random.cpp libcxx/include/variant libcxx/test/std/utilities/variant/variant.variant/variant.assign/T.pass.cpp libcxx/test/std/utilities/variant/variant.variant/variant.assign/conv.pass.cpp libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp libcxx/test/std/utilities/variant/variant.variant/variant.ctor/conv.pass.cpp libcxx/test/support/variant_test_helpers.h lldb/source/Expression/IRExecutionUnit.cpp llvm/include/llvm/ADT/STLExtras.h llvm/include/llvm/CodeGen/AccelTable.h llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h llvm/include/llvm/CodeGen/SDPatternMatch.h llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h llvm/include/llvm/IR/BasicBlock.h llvm/include/llvm/IR/DebugProgramInstruction.h llvm/include/llvm/IR/Instruction.h llvm/include/llvm/IR/PassManager.h llvm/include/llvm/Object/Archive.h llvm/include/llvm/Transforms/Scalar/Float2Int.h llvm/lib/Analysis/ValueTracking.cpp llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp llvm/lib/CodeGen/CodeGenPrepare.cpp llvm/lib/CodeGen/MIRPrinter.cpp llvm/lib/CodeGen/SelectOptimize.cpp llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp llvm/lib/CodeGen/SjLjEHPrepare.cpp llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp llvm/lib/IR/AsmWriter.cpp llvm/lib/IR/BasicBlock.cpp llvm/lib/IR/DebugInfo.cpp 
llvm/lib/IR/DebugProgramInstruction.cpp llvm/lib/IR/Instruction.cpp llvm/lib/IR/LLVMContextImpl.cpp llvm/lib/IR/LLVMContextImpl.h llvm/lib/MC/MCSectionXCOFF.cpp llvm/lib/Object/Archive.cpp llvm/lib/Object/ArchiveWriter.cpp llvm/lib/Target/AArch64/AArch64ISelLowering.cpp llvm/lib/Target/AMDGPU/SIISelLowering.cpp llvm/lib/Target/AMDGPU/SIISelLowering.h llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp llvm/lib/Target/PowerPC/PPCFastISel.cpp llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp llvm/lib/Target/PowerPC/PPCInstrInfo.cpp llvm/lib/Target/PowerPC/PPCSubtarget.cpp llvm/lib/Target/PowerPC/PPCSubtarget.h llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp llvm/lib/Target/SPIRV/SPIRVUtils.h llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86InstrInfo.cpp llvm/lib/Target/X86/X86TargetTransformInfo.h llvm/lib/TargetParser/Host.cpp llvm/lib/Transforms/IPO/AttributorAttributes.cpp llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp llvm/lib/Transforms/InstCombine/InstructionCombining.cpp llvm/lib/Transforms/Scalar/Float2Int.cpp llvm/lib/Transforms/Utils/Local.cpp llvm/lib/Transforms/Utils/LoopRotationUtils.cpp llvm/lib/Transforms/Utils/SimplifyCFG.cpp llvm/lib/Transforms/Utils/ValueMapper.cpp llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp llvm/tools/llvm-ar/llvm-ar.cpp llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp llvm/tools/llvm-exegesis/lib/SubprocessMemory.h llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp llvm/unittests/IR/BasicBlockDbgInfoTest.cpp llvm/unittests/IR/DebugInfoTest.cpp llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp llvm/unittests/Transforms/Utils/DebugifyTest.cpp llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp llvm/utils/TableGen/DecoderEmitter.cpp mlir/include/mlir/Transforms/Inliner.h mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp mlir/lib/Transforms/InlinerPass.cpp mlir/lib/Transforms/Utils/Inliner.cpp
View the diff from clang-format here.
diff --git a/clang/unittests/StaticAnalyzer/IsCLibraryFunctionTest.cpp b/clang/unittests/StaticAnalyzer/IsCLibraryFunctionTest.cpp
index 31ff13f428..1a7b481a4c 100644
--- a/clang/unittests/StaticAnalyzer/IsCLibraryFunctionTest.cpp
+++ b/clang/unittests/StaticAnalyzer/IsCLibraryFunctionTest.cpp
@@ -53,7 +53,8 @@ TEST_F(IsCLibraryFunctionTest, AcceptsExternCGlobal) {
 }
 
 TEST_F(IsCLibraryFunctionTest, RejectsNoInlineNoExternalLinkage) {
-  // Functions that are neither inlined nor externally visible cannot be C library functions.
+  // Functions that are neither inlined nor externally visible cannot be C
+  // library functions.
   ASSERT_TRUE(buildAST(R"cpp(static void fun();)cpp"));
   EXPECT_FALSE(CheckerContext::isCLibraryFunction(getFunctionDecl()));
 }
diff --git a/compiler-rt/lib/fuzzer/FuzzerUtilLinux.cpp b/compiler-rt/lib/fuzzer/FuzzerUtilLinux.cpp
index e5409f22f0..f9a809fc55 100644
--- a/compiler-rt/lib/fuzzer/FuzzerUtilLinux.cpp
+++ b/compiler-rt/lib/fuzzer/FuzzerUtilLinux.cpp
@@ -44,7 +44,8 @@ void SetThreadName(std::thread &thread, const std::string &name) {
 #if LIBFUZZER_LINUX || LIBFUZZER_FREEBSD
   (void)pthread_setname_np(thread.native_handle(), name.c_str());
 #elif LIBFUZZER_NETBSD
-  (void)pthread_setname_np(thread.native_handle(), "%s", const_cast<char *>(name.c_str()));
+  (void)pthread_setname_np(thread.native_handle(), "%s",
+                           const_cast<char *>(name.c_str()));
 #endif
 }
 
diff --git a/compiler-rt/lib/interception/interception.h b/compiler-rt/lib/interception/interception.h
index 38c152952e..029a1d7de3 100644
--- a/compiler-rt/lib/interception/interception.h
+++ b/compiler-rt/lib/interception/interception.h
@@ -191,12 +191,12 @@ const interpose_substitution substitution_##func_name[]             \
 #   define ASM_TYPE_FUNCTION_STR "@function"
 #  endif
 // Keep trampoline implementation in sync with sanitizer_common/sanitizer_asm.h
-#  define DECLARE_WRAPPER(ret_type, func, ...)                                 \
-     extern "C" ret_type func(__VA_ARGS__);                                    \
-     extern "C" ret_type TRAMPOLINE(func)(__VA_ARGS__);                        \
-     extern "C" ret_type __interceptor_##func(__VA_ARGS__)                     \
-       INTERCEPTOR_ATTRIBUTE __attribute__((weak)) ALIAS(WRAP(func));          \
-     asm(                                                                      \
+#    define DECLARE_WRAPPER(ret_type, func, ...)                         \
+      extern "C" ret_type func(__VA_ARGS__);                             \
+      extern "C" ret_type TRAMPOLINE(func)(__VA_ARGS__);                 \
+      extern "C" ret_type __interceptor_##func(__VA_ARGS__)              \
+          INTERCEPTOR_ATTRIBUTE __attribute__((weak)) ALIAS(WRAP(func)); \
+      asm(                                                                      \
        ".text\n"                                                               \
        __ASM_WEAK_WRAPPER(func)                                                \
        ".set " #func ", " SANITIZER_STRINGIFY(TRAMPOLINE(func)) "\n"           \
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_asm.h b/compiler-rt/lib/sanitizer_common/sanitizer_asm.h
index 30e9d15184..ac8b497ac6 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_asm.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_asm.h
@@ -43,11 +43,13 @@
 #endif
 
 #if defined(__aarch64__) && defined(__ARM_FEATURE_BTI_DEFAULT)
-# define ASM_STARTPROC CFI_STARTPROC; hint #34
-# define C_ASM_STARTPROC SANITIZER_STRINGIFY(CFI_STARTPROC) "\nhint #34"
+#  define ASM_STARTPROC \
+    CFI_STARTPROC;      \
+    hint #34
+#  define C_ASM_STARTPROC SANITIZER_STRINGIFY(CFI_STARTPROC) "\nhint #34"
 #else
-# define ASM_STARTPROC CFI_STARTPROC
-# define C_ASM_STARTPROC SANITIZER_STRINGIFY(CFI_STARTPROC)
+#  define ASM_STARTPROC CFI_STARTPROC
+#  define C_ASM_STARTPROC SANITIZER_STRINGIFY(CFI_STARTPROC)
 #endif
 #define ASM_ENDPROC CFI_ENDPROC
 #define C_ASM_ENDPROC SANITIZER_STRINGIFY(CFI_ENDPROC)
@@ -118,17 +120,16 @@
 #  define ASM_TRAMPOLINE_ALIAS(symbol, name)                                   \
          .weak symbol;                                                         \
          .set symbol, __interceptor_trampoline_##name
-#  define ASM_INTERCEPTOR_TRAMPOLINE(name)                                     \
-         .weak __interceptor_##name;                                           \
-         .set __interceptor_##name, ASM_WRAPPER_NAME(name);                    \
-         .globl __interceptor_trampoline_##name;                               \
-         ASM_TYPE_FUNCTION(__interceptor_trampoline_##name);                   \
-         __interceptor_trampoline_##name:                                      \
-                 ASM_STARTPROC;                                                \
-                 ASM_TAIL_CALL ASM_PREEMPTIBLE_SYM(__interceptor_##name);      \
-                 ASM_ENDPROC;                                                  \
-         ASM_SIZE(__interceptor_trampoline_##name)
-#  define ASM_INTERCEPTOR_TRAMPOLINE_SUPPORT 1
+#    define ASM_INTERCEPTOR_TRAMPOLINE(name)                   \
+      .weak __interceptor_##name;                              \
+      .set __interceptor_##name, ASM_WRAPPER_NAME(name);       \
+      .globl __interceptor_trampoline_##name;                  \
+      ASM_TYPE_FUNCTION(__interceptor_trampoline_##name);      \
+      __interceptor_trampoline_##name : ASM_STARTPROC;         \
+      ASM_TAIL_CALL ASM_PREEMPTIBLE_SYM(__interceptor_##name); \
+      ASM_ENDPROC;                                             \
+      ASM_SIZE(__interceptor_trampoline_##name)
+#    define ASM_INTERCEPTOR_TRAMPOLINE_SUPPORT 1
 # endif  // Architecture supports interceptor trampoline
 #else
 # define ASM_HIDDEN(symbol)
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
index 7d7c009f64..6b7c02e5d1 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
@@ -42,7 +42,7 @@ void GetMemoryProfile(fill_profile_f cb, uptr *stats) {
   cb(0, InfoProc->ki_rssize * GetPageSizeCached(), false, stats);
   UnmapOrDie(InfoProc, Size, true);
 }
-#elif SANITIZER_NETBSD
+#  elif SANITIZER_NETBSD
 void GetMemoryProfile(fill_profile_f cb, uptr *stats) {
   struct kinfo_proc2 *InfoProc;
   uptr Len = sizeof(*InfoProc);
@@ -55,7 +55,7 @@ void GetMemoryProfile(fill_profile_f cb, uptr *stats) {
   cb(0, InfoProc->p_vm_rssize * GetPageSizeCached(), false, stats);
   UnmapOrDie(InfoProc, Size, true);
 }
-#endif
+#  endif
 
 void ReadProcMaps(ProcSelfMapsBuff *proc_maps) {
   const int Mib[] = {
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_common.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_common.cpp
index 7214a2b9ea..9db035e0d2 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_common.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_common.cpp
@@ -145,7 +145,7 @@ void MemoryMappingLayout::DumpListOfModules(
   }
 }
 
-#if SANITIZER_LINUX || SANITIZER_ANDROID || SANITIZER_SOLARIS
+#  if SANITIZER_LINUX || SANITIZER_ANDROID || SANITIZER_SOLARIS
 void GetMemoryProfile(fill_profile_f cb, uptr *stats) {
   char *smaps = nullptr;
   uptr smaps_cap = 0;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 61b88edab0..bf3907a853 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10581,15 +10581,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
   case X86::VPDPWSSDZ256m:
   case X86::VPDPWSSDZr:
   case X86::VPDPWSSDZm: {
-   if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
+    if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
       Patterns.push_back(MachineCombinerPattern::DPWSSD);
       return true;
     }
     break;
   }
-  } 
-  return TargetInstrInfo::getMachineCombinerPatterns(Root, 
-                                                     Patterns, DoRegPressureReduce);
+  }
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+                                                     DoRegPressureReduce);
 }
 
 static void
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 23035f6550..bdaf9b2f13 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -42,72 +42,43 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       X86::FeatureX86_64,
 
       // These features don't have any intrinsics or ABI effect.
-      X86::FeatureNOPL,
-      X86::FeatureCX16,
-      X86::FeatureLAHFSAHF64,
+      X86::FeatureNOPL, X86::FeatureCX16, X86::FeatureLAHFSAHF64,
 
       // Some older targets can be setup to fold unaligned loads.
       X86::FeatureSSEUnalignedMem,
 
       // Codegen control options.
-      X86::TuningFast11ByteNOP,
-      X86::TuningFast15ByteNOP,
-      X86::TuningFastBEXTR,
-      X86::TuningFastHorizontalOps,
-      X86::TuningFastLZCNT,
-      X86::TuningFastScalarFSQRT,
-      X86::TuningFastSHLDRotate,
-      X86::TuningFastScalarShiftMasks,
-      X86::TuningFastVectorShiftMasks,
+      X86::TuningFast11ByteNOP, X86::TuningFast15ByteNOP, X86::TuningFastBEXTR,
+      X86::TuningFastHorizontalOps, X86::TuningFastLZCNT,
+      X86::TuningFastScalarFSQRT, X86::TuningFastSHLDRotate,
+      X86::TuningFastScalarShiftMasks, X86::TuningFastVectorShiftMasks,
       X86::TuningFastVariableCrossLaneShuffle,
-      X86::TuningFastVariablePerLaneShuffle,
-      X86::TuningFastVectorFSQRT,
-      X86::TuningLEAForSP,
-      X86::TuningLEAUsesAG,
-      X86::TuningLZCNTFalseDeps,
-      X86::TuningBranchFusion,
-      X86::TuningMacroFusion,
-      X86::TuningPadShortFunctions,
-      X86::TuningPOPCNTFalseDeps,
-      X86::TuningMULCFalseDeps,
-      X86::TuningPERMFalseDeps,
-      X86::TuningRANGEFalseDeps,
-      X86::TuningGETMANTFalseDeps,
-      X86::TuningMULLQFalseDeps,
-      X86::TuningSlow3OpsLEA,
-      X86::TuningSlowDivide32,
-      X86::TuningSlowDivide64,
-      X86::TuningSlowIncDec,
-      X86::TuningSlowLEA,
-      X86::TuningSlowPMADDWD,
-      X86::TuningSlowPMULLD,
-      X86::TuningSlowSHLD,
-      X86::TuningSlowTwoMemOps,
-      X86::TuningSlowUAMem16,
-      X86::TuningPreferMaskRegisters,
-      X86::TuningInsertVZEROUPPER,
-      X86::TuningUseSLMArithCosts,
-      X86::TuningUseGLMDivSqrtCosts,
-      X86::TuningNoDomainDelay,
-      X86::TuningNoDomainDelayMov,
-      X86::TuningNoDomainDelayShuffle,
-      X86::TuningNoDomainDelayBlend,
-      X86::TuningPreferShiftShuffle,
-      X86::TuningFastImmVectorShift,
+      X86::TuningFastVariablePerLaneShuffle, X86::TuningFastVectorFSQRT,
+      X86::TuningLEAForSP, X86::TuningLEAUsesAG, X86::TuningLZCNTFalseDeps,
+      X86::TuningBranchFusion, X86::TuningMacroFusion,
+      X86::TuningPadShortFunctions, X86::TuningPOPCNTFalseDeps,
+      X86::TuningMULCFalseDeps, X86::TuningPERMFalseDeps,
+      X86::TuningRANGEFalseDeps, X86::TuningGETMANTFalseDeps,
+      X86::TuningMULLQFalseDeps, X86::TuningSlow3OpsLEA,
+      X86::TuningSlowDivide32, X86::TuningSlowDivide64, X86::TuningSlowIncDec,
+      X86::TuningSlowLEA, X86::TuningSlowPMADDWD, X86::TuningSlowPMULLD,
+      X86::TuningSlowSHLD, X86::TuningSlowTwoMemOps, X86::TuningSlowUAMem16,
+      X86::TuningPreferMaskRegisters, X86::TuningInsertVZEROUPPER,
+      X86::TuningUseSLMArithCosts, X86::TuningUseGLMDivSqrtCosts,
+      X86::TuningNoDomainDelay, X86::TuningNoDomainDelayMov,
+      X86::TuningNoDomainDelayShuffle, X86::TuningNoDomainDelayBlend,
+      X86::TuningPreferShiftShuffle, X86::TuningFastImmVectorShift,
       X86::TuningFastDPWSSD,
 
       // Perf-tuning flags.
-      X86::TuningFastGather,
-      X86::TuningSlowUAMem32,
+      X86::TuningFastGather, X86::TuningSlowUAMem32,
       X86::TuningAllowLight256Bit,
 
       // Based on whether user set the -mprefer-vector-width command line.
-      X86::TuningPrefer128Bit,
-      X86::TuningPrefer256Bit,
+      X86::TuningPrefer128Bit, X86::TuningPrefer256Bit,
 
       // CPU name enums. These just follow CPU string.
-      X86::ProcIntelAtom
-  };
+      X86::ProcIntelAtom};
 
 public:
   explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F)
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 2b764c9e1f..36e3b68f82 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -132,7 +132,7 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
   SDValue Mul = DAG->getNode(ISD::MUL, DL, Int32VT, Add, Sub);
   SDValue And = DAG->getNode(ISD::AND, DL, Int32VT, Op0, Op1);
   SDValue Xor = DAG->getNode(ISD::XOR, DL, Int32VT, Op1, Op0);
-  SDValue Or  = DAG->getNode(ISD::OR, DL, Int32VT, Op0, Op1);
+  SDValue Or = DAG->getNode(ISD::OR, DL, Int32VT, Op0, Op1);
 
   SDValue SFAdd = DAG->getNode(ISD::STRICT_FADD, DL, {Float32VT, MVT::Other},
                                {DAG->getEntryNode(), Op2, Op2});

@@ -48215,7 +48215,7 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,

// We do not split for SSE at all, but we need to split vectors for AVX1 and
// AVX2.
-if (!Subtarget.useAVX512Regs() && VT.is512BitVector() && 
+if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
Contributor:

Unrelated change

@@ -10572,18 +10572,23 @@ bool X86InstrInfo::getMachineCombinerPatterns(
case X86::VPDPWSSDrm:
case X86::VPDPWSSDYrr:
case X86::VPDPWSSDYrm: {
Patterns.push_back(MachineCombinerPattern::DPWSSD);
return true;
if (!Subtarget.hasFastPWSSD()) {
Contributor:

Put this at the beginning?

if (Subtarget.hasFastPWSSD())
  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                     DoRegPressureReduce);

Collaborator:

Can you remove the default case, break if we don't use the MachineCombinerPattern::DPWSSD pattern, and move this to after the switch:

return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce);
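
A rough sketch of that restructuring (not the committed code; the hasFastDPWSSD() spelling follows the later clang-format diff in this thread):

  switch (Root.getOpcode()) {
  // No default: unmatched opcodes fall through to the base implementation below.
  case X86::VPDPWSSDrr:
  case X86::VPDPWSSDrm:
  case X86::VPDPWSSDYrr:
  case X86::VPDPWSSDYrm:
    if (!Subtarget.hasFastDPWSSD()) {
      Patterns.push_back(MachineCombinerPattern::DPWSSD);
      return true;
    }
    break;
  case X86::VPDPWSSDZ128r:
  case X86::VPDPWSSDZ128m:
  case X86::VPDPWSSDZ256r:
  case X86::VPDPWSSDZ256m:
  case X86::VPDPWSSDZr:
  case X86::VPDPWSSDZm:
    if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
      Patterns.push_back(MachineCombinerPattern::DPWSSD);
      return true;
    }
    break;
  }
  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                     DoRegPressureReduce);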


declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>) #1

attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
Contributor:

Not needed

// Generate vpdpwssd instead of vpmaddwd+vpaddd sequence.
def TuningFastPWSSD
: SubtargetFeature<
"fast-pwssd", "HasFastPWSSD", "true",
Collaborator:

Shouldn't it be fast-dpwssd?

@@ -0,0 +1,15 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
Collaborator:

The NOTE should be the first line to keep regeneration scripts happy

@@ -0,0 +1,15 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
Collaborator:

Add an extra RUN that takes the -mattr=+avx512vnni,+fast-dpwssd
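
For example, alongside the existing znver4 RUN line (a sketch; the attribute spelling assumes the fast-dpwssd rename suggested above):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s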

@RKSimon (Collaborator) left a comment

This patch needs to be cleanly rebased on trunk (force push is OK in PR branches)
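
A typical sequence for that (assuming a remote named upstream pointing at llvm/llvm-project and the PR branch currently checked out):

git fetch upstream                        # get the latest trunk
git rebase upstream/main                  # replay the PR commits on top of it
git push --force-with-lease origin HEAD   # update the PR branch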

@ganeshgit (Contributor, Author) commented Mar 14, 2024

This patch needs to be cleanly rebased on trunk (force push is OK in PR branches)

I think my overnight rebase scripts screwed up the fixups. I will close this and raise a new pull request.

Successfully merging this pull request may close these issues:

vpdpwssd instruction not generated despite giving better performance than vpmaddwd+vpaddd expansion