Add AVX-VNNI support for Alder Lake and later.

In their infinite wisdom, Intel axed AVX512 from Alder Lake chips (well, not entirely, but we kind of want to use the Gracemont cores for chess!) but still added VNNI support. Confusingly enough, this is not the same as VNNI256 support.

This adds a specific AVX-VNNI target that will use this AVX-VNNI mode, by prefixing the VNNI instructions with the appropriate VEX prefix, and avoiding AVX512 usage.

This is about 1% faster on P cores:

Result of 20 runs
==================
base (./clang-bmi2 ) = 3306337 +/- 7519
test (./clang-vnni ) = 3344226 +/- 7388
diff = +37889 +/- 4153
speedup = +0.0115
P(speedup > 0) = 1.0000

But a nice 3% faster on E cores:

Result of 20 runs
==================
base (./clang-bmi2 ) = 1938054 +/- 28257
test (./clang-vnni ) = 1994606 +/- 31756
diff = +56552 +/- 3735
speedup = +0.0292
P(speedup > 0) = 1.0000

This was measured on Clang 13. GCC 11.2 appears to generate worse code for Alder Lake, though the speedup on the E cores is similar.

It is possible to run the engine specifically on the P or E cores using CPU binding; for example, on Linux it is possible to use (for an 8 P + 8 E setup like the i9-12900K):

taskset -c 0-15 ./stockfish
taskset -c 16-23 ./stockfish

where the first call binds to the P-cores and the second to the E-cores.

closes #3824

No functional change
official-stockfish · Dec 3, 2021 · c9977aa · c9977aa
1 parent c1f9a35
commit c9977aa
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 5 deletions.
diff --git a/src/Makefile b/src/Makefile
@@ -78,6 +78,7 @@ endif
 # ssse3 = yes/no      --- -mssse3          --- Use Intel Supplemental Streaming SIMD Extensions 3
 # sse41 = yes/no      --- -msse4.1         --- Use Intel Streaming SIMD Extensions 4.1
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
+# avxvnni = yes/no    --- -mavxvnni        --- Use Intel Vector Neural Network Instructions AVX
 # avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
 # vnni256 = yes/no    --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 256
 # vnni512 = yes/no    --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 512
@@ -100,8 +101,8 @@ endif
 # explicitly check for the list of supported architectures (as listed with make help),
 # the user can override with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true`
 ifeq ($(ARCH), $(filter $(ARCH), \
-                 x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \
-                 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
+                 x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-avxvnni x86-64-bmi2 \
+                 x86-64-avx2 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
                  x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 e2k \
                  armv7 armv7-neon armv8 apple-silicon general-64 general-32))
    SUPPORTED_ARCH=true
@@ -122,6 +123,7 @@ sse2 = no
 ssse3 = no
 sse41 = no
 avx2 = no
+avxvnni = no
 avx512 = no
 vnni256 = no
 vnni512 = no
@@ -192,6 +194,17 @@ ifeq ($(findstring -avx2,$(ARCH)),-avx2)
 	avx2 = yes
 endif
 
+ifeq ($(findstring -avxvnni,$(ARCH)),-avxvnni)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	avxvnni = yes
+	pext = yes
+endif
+
 ifeq ($(findstring -bmi2,$(ARCH)),-bmi2)
 	popcnt = yes
 	sse = yes
@@ -544,6 +557,13 @@ ifeq ($(avx2),yes)
 	endif
 endif
 
+ifeq ($(avxvnni),yes)
+	CXXFLAGS += -DUSE_VNNI -DUSE_AVXVNNI
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mavxvnni
+	endif
+endif
+
 ifeq ($(avx512),yes)
 	CXXFLAGS += -DUSE_AVX512
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
@@ -689,6 +709,7 @@ help:
 	@echo "x86-64-vnni512          > x86 64-bit with vnni support 512bit wide"
 	@echo "x86-64-vnni256          > x86 64-bit with vnni support 256bit wide"
 	@echo "x86-64-avx512           > x86 64-bit with avx512 support"
+	@echo "x86-64-avxvnni          > x86 64-bit with avxvnni support"
 	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
 	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
 	@echo "x86-64-sse41-popcnt     > x86 64-bit with sse41 and popcnt support"
@@ -837,6 +858,7 @@ config-sanity: net
 	@echo "ssse3: '$(ssse3)'"
 	@echo "sse41: '$(sse41)'"
 	@echo "avx2: '$(avx2)'"
+	@echo "avxvnni: '$(avxvnni)'"
 	@echo "avx512: '$(avx512)'"
 	@echo "vnni256: '$(vnni256)'"
 	@echo "vnni512: '$(vnni512)'"

diff --git a/src/simd.h b/src/simd.h
@@ -46,6 +46,13 @@
 #define USE_INLINE_ASM
 #endif
 
+// Use either the AVX512 or AVX-VNNI version of the VNNI instructions.
+#if defined(USE_AVXVNNI)
+#define VNNI_PREFIX "%{vex%} "
+#else
+#define VNNI_PREFIX ""
+#endif
+
 namespace Stockfish::Simd {
 
 #if defined (USE_AVX512)
@@ -208,7 +215,7 @@ namespace Stockfish::Simd {
 # if defined (USE_VNNI)
 #   if defined (USE_INLINE_ASM)
       asm(
-        "vpdpbusd %[b], %[a], %[acc]\n\t"
+        VNNI_PREFIX "vpdpbusd %[b], %[a], %[acc]\n\t"
         : [acc]"+v"(acc)
         : [a]"v"(a), [b]"vm"(b)
       );
@@ -240,8 +247,8 @@ namespace Stockfish::Simd {
 # if defined (USE_VNNI)
 #   if defined (USE_INLINE_ASM)
       asm(
-        "vpdpbusd %[b0], %[a0], %[acc]\n\t"
-        "vpdpbusd %[b1], %[a1], %[acc]\n\t"
+        VNNI_PREFIX "vpdpbusd %[b0], %[a0], %[acc]\n\t"
+        VNNI_PREFIX "vpdpbusd %[b1], %[a1], %[acc]\n\t"
         : [acc]"+v"(acc)
         : [a0]"v"(a0), [b0]"vm"(b0), [a1]"v"(a1), [b1]"vm"(b1)
       );