official-stockfish · mstembera · Jul 12, 2023
diff --git a/src/nnue/layers/affine_transform_sparse_input.h b/src/nnue/layers/affine_transform_sparse_input.h
@@ -34,43 +34,15 @@
 */
 
 namespace Stockfish::Eval::NNUE::Layers {
-#if defined(__GNUC__)  // GCC, Clang, ICC
-
-  static inline IndexType lsb_(std::uint32_t b) {
-    assert(b);
-    return IndexType(__builtin_ctzl(b));
-  }
-
-#elif defined(_MSC_VER)  // MSVC
-
-  static inline IndexType lsb_(std::uint32_t b) {
-    assert(b);
-    unsigned long idx;
-    _BitScanForward(&idx, b);
-    return (IndexType) idx;
-  }
-
-#else  // Compiler is neither GCC nor MSVC compatible
-
-#error "Compiler not supported."
-
-#endif
-
 
 #if defined(USE_SSSE3)
   alignas(CacheLineSize) static inline const std::array<std::array<std::uint16_t, 8>, 256> lookup_indices = [](){
     std::array<std::array<std::uint16_t, 8>, 256> v{};
-    for (int i = 0; i < 256; ++i)
+    for (unsigned i = 0; i < 256; ++i)
     {
-      int j = i;
-      int k = 0;
+      std::uint64_t j = i, k = 0;
       while(j)
-      {
-        const IndexType lsbIndex = lsb_(std::uint32_t(j));
-        j &= j - 1;
-        v[i][k] = lsbIndex;
-        ++k;
-      }
+        v[i][k++] = pop_lsb(j);
     }
     return v;
   }();
@@ -83,7 +55,11 @@ namespace Stockfish::Eval::NNUE::Layers {
     #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
 #elif defined (USE_AVX2)
     using vec_t = __m256i;
-    #define vec_nnz(a) _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
+    #if defined(USE_VNNI) && !defined(USE_AVXVNNI)
+        #define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256())
+    #else
+        #define vec_nnz(a) _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
+    #endif
 #elif defined (USE_SSSE3)
     using vec_t = __m128i;
     #define vec_nnz(a) _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
@@ -97,8 +73,8 @@ namespace Stockfish::Eval::NNUE::Layers {
 
     const auto inputVector = reinterpret_cast<const vec_t*>(input);
     IndexType count = 0;
-    __m128i base = _mm_set1_epi16(0);
-    __m128i increment = _mm_set1_epi16(8);
+    __m128i base = _mm_setzero_si128();
+    const __m128i increment = _mm_set1_epi16(8);
     for (IndexType i = 0; i < NumChunks; ++i)
     {
       // bitmask of nonzero values in this chunk